1 /*
    2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
    3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4  *
    5  * This code is free software; you can redistribute it and/or modify it
    6  * under the terms of the GNU General Public License version 2 only, as
    7  * published by the Free Software Foundation.
    8  *
    9  * This code is distributed in the hope that it will be useful, but WITHOUT
   10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   12  * version 2 for more details (a copy is included in the LICENSE file that
   13  * accompanied this code).
   14  *
   15  * You should have received a copy of the GNU General Public License version
   16  * 2 along with this work; if not, write to the Free Software Foundation,
   17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   18  *
   19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   20  * or visit www.oracle.com if you need additional information or have any
   21  * questions.
   22  *
   23  */
   24 
   25 #include "precompiled.hpp"
   26 #include "jvm.h"
   27 #include "asm/assembler.hpp"
   28 #include "asm/assembler.inline.hpp"
   29 #include "compiler/disassembler.hpp"
   30 #include "gc/shared/cardTable.hpp"
   31 #include "gc/shared/cardTableModRefBS.hpp"
   32 #include "gc/shared/collectedHeap.inline.hpp"
   33 #include "interpreter/interpreter.hpp"
   34 #include "memory/resourceArea.hpp"
   35 #include "memory/universe.hpp"
   36 #include "oops/klass.inline.hpp"
   37 #include "prims/methodHandles.hpp"
   38 #include "runtime/biasedLocking.hpp"
   39 #include "runtime/interfaceSupport.hpp"
   40 #include "runtime/objectMonitor.hpp"
   41 #include "runtime/os.hpp"
   42 #include "runtime/safepoint.hpp"
   43 #include "runtime/safepointMechanism.hpp"
   44 #include "runtime/sharedRuntime.hpp"
   45 #include "runtime/stubRoutines.hpp"
   46 #include "runtime/thread.hpp"
   47 #include "utilities/macros.hpp"
   48 #if INCLUDE_ALL_GCS
   49 #include "gc/g1/g1BarrierSet.hpp"
   50 #include "gc/g1/g1CardTable.hpp"
   51 #include "gc/g1/g1CollectedHeap.inline.hpp"
   52 #include "gc/g1/heapRegion.hpp"
   53 #endif // INCLUDE_ALL_GCS
   54 #include "crc32c.h"
   55 #ifdef COMPILER2
   56 #include "opto/intrinsicnode.hpp"
   57 #endif
   58 
   59 #ifdef PRODUCT
   60 #define BLOCK_COMMENT(str) /* nothing */
   61 #define STOP(error) stop(error)
   62 #else
   63 #define BLOCK_COMMENT(str) block_comment(str)
   64 #define STOP(error) block_comment(error); stop(error)
   65 #endif
   66 
   67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   68 
   69 #ifdef ASSERT
   70 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
   71 #endif
   72 
   73 static Assembler::Condition reverse[] = {
   74     Assembler::noOverflow     /* overflow      = 0x0 */ ,
   75     Assembler::overflow       /* noOverflow    = 0x1 */ ,
   76     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
   77     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
   78     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
   79     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
   80     Assembler::above          /* belowEqual    = 0x6 */ ,
   81     Assembler::belowEqual     /* above         = 0x7 */ ,
   82     Assembler::positive       /* negative      = 0x8 */ ,
   83     Assembler::negative       /* positive      = 0x9 */ ,
   84     Assembler::noParity       /* parity        = 0xa */ ,
   85     Assembler::parity         /* noParity      = 0xb */ ,
   86     Assembler::greaterEqual   /* less          = 0xc */ ,
   87     Assembler::less           /* greaterEqual  = 0xd */ ,
   88     Assembler::greater        /* lessEqual     = 0xe */ ,
   89     Assembler::lessEqual      /* greater       = 0xf, */
   90 
   91 };
   92 
   93 
   94 // Implementation of MacroAssembler
   95 
   96 // First all the versions that have distinct versions depending on 32/64 bit
   97 // Unless the difference is trivial (1 line or so).
   98 
   99 #ifndef _LP64
  100 
  101 // 32bit versions
  102 
  103 Address MacroAssembler::as_Address(AddressLiteral adr) {
  104   return Address(adr.target(), adr.rspec());
  105 }
  106 
  107 Address MacroAssembler::as_Address(ArrayAddress adr) {
  108   return Address::make_array(adr);
  109 }
  110 
  111 void MacroAssembler::call_VM_leaf_base(address entry_point,
  112                                        int number_of_arguments) {
  113   call(RuntimeAddress(entry_point));
  114   increment(rsp, number_of_arguments * wordSize);
  115 }
  116 
  117 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  118   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
  119 }
  120 
  121 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  122   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
  123 }
  124 
  125 void MacroAssembler::cmpoop(Address src1, jobject obj) {
  126   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
  127 }
  128 
  129 void MacroAssembler::cmpoop(Register src1, jobject obj) {
  130   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
  131 }
  132 
  133 void MacroAssembler::extend_sign(Register hi, Register lo) {
  134   // According to Intel Doc. AP-526, "Integer Divide", p.18.
  135   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
  136     cdql();
  137   } else {
  138     movl(hi, lo);
  139     sarl(hi, 31);
  140   }
  141 }
  142 
  143 void MacroAssembler::jC2(Register tmp, Label& L) {
  144   // set parity bit if FPU flag C2 is set (via rax)
  145   save_rax(tmp);
  146   fwait(); fnstsw_ax();
  147   sahf();
  148   restore_rax(tmp);
  149   // branch
  150   jcc(Assembler::parity, L);
  151 }
  152 
  153 void MacroAssembler::jnC2(Register tmp, Label& L) {
  154   // set parity bit if FPU flag C2 is set (via rax)
  155   save_rax(tmp);
  156   fwait(); fnstsw_ax();
  157   sahf();
  158   restore_rax(tmp);
  159   // branch
  160   jcc(Assembler::noParity, L);
  161 }
  162 
  163 // 32bit can do a case table jump in one instruction but we no longer allow the base
  164 // to be installed in the Address class
  165 void MacroAssembler::jump(ArrayAddress entry) {
  166   jmp(as_Address(entry));
  167 }
  168 
  169 // Note: y_lo will be destroyed
  170 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  171   // Long compare for Java (semantics as described in JVM spec.)
  172   Label high, low, done;
  173 
  174   cmpl(x_hi, y_hi);
  175   jcc(Assembler::less, low);
  176   jcc(Assembler::greater, high);
  177   // x_hi is the return register
  178   xorl(x_hi, x_hi);
  179   cmpl(x_lo, y_lo);
  180   jcc(Assembler::below, low);
  181   jcc(Assembler::equal, done);
  182 
  183   bind(high);
  184   xorl(x_hi, x_hi);
  185   increment(x_hi);
  186   jmp(done);
  187 
  188   bind(low);
  189   xorl(x_hi, x_hi);
  190   decrementl(x_hi);
  191 
  192   bind(done);
  193 }
  194 
  195 void MacroAssembler::lea(Register dst, AddressLiteral src) {
  196     mov_literal32(dst, (int32_t)src.target(), src.rspec());
  197 }
  198 
  199 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  200   // leal(dst, as_Address(adr));
  201   // see note in movl as to why we must use a move
  202   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
  203 }
  204 
  205 void MacroAssembler::leave() {
  206   mov(rsp, rbp);
  207   pop(rbp);
  208 }
  209 
  210 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  211   // Multiplication of two Java long values stored on the stack
  212   // as illustrated below. Result is in rdx:rax.
  213   //
  214   // rsp ---> [  ??  ] \               \
  215   //            ....    | y_rsp_offset  |
  216   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  217   //          [ y_hi ]                  | (in bytes)
  218   //            ....                    |
  219   //          [ x_lo ]                 /
  220   //          [ x_hi ]
  221   //            ....
  222   //
  223   // Basic idea: lo(result) = lo(x_lo * y_lo)
  224   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  225   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  226   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  227   Label quick;
  228   // load x_hi, y_hi and check if quick
  229   // multiplication is possible
  230   movl(rbx, x_hi);
  231   movl(rcx, y_hi);
  232   movl(rax, rbx);
  233   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  234   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  235   // do full multiplication
  236   // 1st step
  237   mull(y_lo);                                    // x_hi * y_lo
  238   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  239   // 2nd step
  240   movl(rax, x_lo);
  241   mull(rcx);                                     // x_lo * y_hi
  242   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  243   // 3rd step
  244   bind(quick);                                   // note: rbx, = 0 if quick multiply!
  245   movl(rax, x_lo);
  246   mull(y_lo);                                    // x_lo * y_lo
  247   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
  248 }
  249 
  250 void MacroAssembler::lneg(Register hi, Register lo) {
  251   negl(lo);
  252   adcl(hi, 0);
  253   negl(hi);
  254 }
  255 
  256 void MacroAssembler::lshl(Register hi, Register lo) {
  257   // Java shift left long support (semantics as described in JVM spec., p.305)
  258   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  259   // shift value is in rcx !
  260   assert(hi != rcx, "must not use rcx");
  261   assert(lo != rcx, "must not use rcx");
  262   const Register s = rcx;                        // shift count
  263   const int      n = BitsPerWord;
  264   Label L;
  265   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  266   cmpl(s, n);                                    // if (s < n)
  267   jcc(Assembler::less, L);                       // else (s >= n)
  268   movl(hi, lo);                                  // x := x << n
  269   xorl(lo, lo);
  270   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  271   bind(L);                                       // s (mod n) < n
  272   shldl(hi, lo);                                 // x := x << s
  273   shll(lo);
  274 }
  275 
  276 
  277 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  278   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  279   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  280   assert(hi != rcx, "must not use rcx");
  281   assert(lo != rcx, "must not use rcx");
  282   const Register s = rcx;                        // shift count
  283   const int      n = BitsPerWord;
  284   Label L;
  285   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  286   cmpl(s, n);                                    // if (s < n)
  287   jcc(Assembler::less, L);                       // else (s >= n)
  288   movl(lo, hi);                                  // x := x >> n
  289   if (sign_extension) sarl(hi, 31);
  290   else                xorl(hi, hi);
  291   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  292   bind(L);                                       // s (mod n) < n
  293   shrdl(lo, hi);                                 // x := x >> s
  294   if (sign_extension) sarl(hi);
  295   else                shrl(hi);
  296 }
  297 
  298 void MacroAssembler::movoop(Register dst, jobject obj) {
  299   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
  300 }
  301 
  302 void MacroAssembler::movoop(Address dst, jobject obj) {
  303   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
  304 }
  305 
  306 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  307   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
  308 }
  309 
  310 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  311   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
  312 }
  313 
  314 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  315   // scratch register is not used,
  316   // it is defined to match parameters of 64-bit version of this method.
  317   if (src.is_lval()) {
  318     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  319   } else {
  320     movl(dst, as_Address(src));
  321   }
  322 }
  323 
  324 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  325   movl(as_Address(dst), src);
  326 }
  327 
  328 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  329   movl(dst, as_Address(src));
  330 }
  331 
  332 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
  333 void MacroAssembler::movptr(Address dst, intptr_t src) {
  334   movl(dst, src);
  335 }
  336 
  337 
  338 void MacroAssembler::pop_callee_saved_registers() {
  339   pop(rcx);
  340   pop(rdx);
  341   pop(rdi);
  342   pop(rsi);
  343 }
  344 
  345 void MacroAssembler::pop_fTOS() {
  346   fld_d(Address(rsp, 0));
  347   addl(rsp, 2 * wordSize);
  348 }
  349 
  350 void MacroAssembler::push_callee_saved_registers() {
  351   push(rsi);
  352   push(rdi);
  353   push(rdx);
  354   push(rcx);
  355 }
  356 
  357 void MacroAssembler::push_fTOS() {
  358   subl(rsp, 2 * wordSize);
  359   fstp_d(Address(rsp, 0));
  360 }
  361 
  362 
  363 void MacroAssembler::pushoop(jobject obj) {
  364   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
  365 }
  366 
  367 void MacroAssembler::pushklass(Metadata* obj) {
  368   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
  369 }
  370 
  371 void MacroAssembler::pushptr(AddressLiteral src) {
  372   if (src.is_lval()) {
  373     push_literal32((int32_t)src.target(), src.rspec());
  374   } else {
  375     pushl(as_Address(src));
  376   }
  377 }
  378 
  379 void MacroAssembler::set_word_if_not_zero(Register dst) {
  380   xorl(dst, dst);
  381   set_byte_if_not_zero(dst);
  382 }
  383 
  384 static void pass_arg0(MacroAssembler* masm, Register arg) {
  385   masm->push(arg);
  386 }
  387 
  388 static void pass_arg1(MacroAssembler* masm, Register arg) {
  389   masm->push(arg);
  390 }
  391 
  392 static void pass_arg2(MacroAssembler* masm, Register arg) {
  393   masm->push(arg);
  394 }
  395 
  396 static void pass_arg3(MacroAssembler* masm, Register arg) {
  397   masm->push(arg);
  398 }
  399 
  400 #ifndef PRODUCT
  401 extern "C" void findpc(intptr_t x);
  402 #endif
  403 
  404 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  405   // In order to get locks to work, we need to fake a in_VM state
  406   JavaThread* thread = JavaThread::current();
  407   JavaThreadState saved_state = thread->thread_state();
  408   thread->set_thread_state(_thread_in_vm);
  409   if (ShowMessageBoxOnError) {
  410     JavaThread* thread = JavaThread::current();
  411     JavaThreadState saved_state = thread->thread_state();
  412     thread->set_thread_state(_thread_in_vm);
  413     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
  414       ttyLocker ttyl;
  415       BytecodeCounter::print();
  416     }
  417     // To see where a verify_oop failed, get $ebx+40/X for this frame.
  418     // This is the value of eip which points to where verify_oop will return.
  419     if (os::message_box(msg, "Execution stopped, print registers?")) {
  420       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
  421       BREAKPOINT;
  422     }
  423   } else {
  424     ttyLocker ttyl;
  425     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
  426   }
  427   // Don't assert holding the ttyLock
  428     assert(false, "DEBUG MESSAGE: %s", msg);
  429   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  430 }
  431 
  432 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  433   ttyLocker ttyl;
  434   FlagSetting fs(Debugging, true);
  435   tty->print_cr("eip = 0x%08x", eip);
  436 #ifndef PRODUCT
  437   if ((WizardMode || Verbose) && PrintMiscellaneous) {
  438     tty->cr();
  439     findpc(eip);
  440     tty->cr();
  441   }
  442 #endif
  443 #define PRINT_REG(rax) \
  444   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  445   PRINT_REG(rax);
  446   PRINT_REG(rbx);
  447   PRINT_REG(rcx);
  448   PRINT_REG(rdx);
  449   PRINT_REG(rdi);
  450   PRINT_REG(rsi);
  451   PRINT_REG(rbp);
  452   PRINT_REG(rsp);
  453 #undef PRINT_REG
  454   // Print some words near top of staack.
  455   int* dump_sp = (int*) rsp;
  456   for (int col1 = 0; col1 < 8; col1++) {
  457     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
  458     os::print_location(tty, *dump_sp++);
  459   }
  460   for (int row = 0; row < 16; row++) {
  461     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
  462     for (int col = 0; col < 8; col++) {
  463       tty->print(" 0x%08x", *dump_sp++);
  464     }
  465     tty->cr();
  466   }
  467   // Print some instructions around pc:
  468   Disassembler::decode((address)eip-64, (address)eip);
  469   tty->print_cr("--------");
  470   Disassembler::decode((address)eip, (address)eip+32);
  471 }
  472 
  473 void MacroAssembler::stop(const char* msg) {
  474   ExternalAddress message((address)msg);
  475   // push address of message
  476   pushptr(message.addr());
  477   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  478   pusha();                                            // push registers
  479   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  480   hlt();
  481 }
  482 
  483 void MacroAssembler::warn(const char* msg) {
  484   push_CPU_state();
  485 
  486   ExternalAddress message((address) msg);
  487   // push address of message
  488   pushptr(message.addr());
  489 
  490   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  491   addl(rsp, wordSize);       // discard argument
  492   pop_CPU_state();
  493 }
  494 
  495 void MacroAssembler::print_state() {
  496   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  497   pusha();                                            // push registers
  498 
  499   push_CPU_state();
  500   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  501   pop_CPU_state();
  502 
  503   popa();
  504   addl(rsp, wordSize);
  505 }
  506 
  507 #else // _LP64
  508 
  509 // 64 bit versions
  510 
  511 Address MacroAssembler::as_Address(AddressLiteral adr) {
  512   // amd64 always does this as a pc-rel
  513   // we can be absolute or disp based on the instruction type
  514   // jmp/call are displacements others are absolute
  515   assert(!adr.is_lval(), "must be rval");
  516   assert(reachable(adr), "must be");
  517   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
  518 
  519 }
  520 
  521 Address MacroAssembler::as_Address(ArrayAddress adr) {
  522   AddressLiteral base = adr.base();
  523   lea(rscratch1, base);
  524   Address index = adr.index();
  525   assert(index._disp == 0, "must not have disp"); // maybe it can?
  526   Address array(rscratch1, index._index, index._scale, index._disp);
  527   return array;
  528 }
  529 
  530 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  531   Label L, E;
  532 
  533 #ifdef _WIN64
  534   // Windows always allocates space for it's register args
  535   assert(num_args <= 4, "only register arguments supported");
  536   subq(rsp,  frame::arg_reg_save_area_bytes);
  537 #endif
  538 
  539   // Align stack if necessary
  540   testl(rsp, 15);
  541   jcc(Assembler::zero, L);
  542 
  543   subq(rsp, 8);
  544   {
  545     call(RuntimeAddress(entry_point));
  546   }
  547   addq(rsp, 8);
  548   jmp(E);
  549 
  550   bind(L);
  551   {
  552     call(RuntimeAddress(entry_point));
  553   }
  554 
  555   bind(E);
  556 
  557 #ifdef _WIN64
  558   // restore stack pointer
  559   addq(rsp, frame::arg_reg_save_area_bytes);
  560 #endif
  561 
  562 }
  563 
  564 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  565   assert(!src2.is_lval(), "should use cmpptr");
  566 
  567   if (reachable(src2)) {
  568     cmpq(src1, as_Address(src2));
  569   } else {
  570     lea(rscratch1, src2);
  571     Assembler::cmpq(src1, Address(rscratch1, 0));
  572   }
  573 }
  574 
  575 int MacroAssembler::corrected_idivq(Register reg) {
  576   // Full implementation of Java ldiv and lrem; checks for special
  577   // case as described in JVM spec., p.243 & p.271.  The function
  578   // returns the (pc) offset of the idivl instruction - may be needed
  579   // for implicit exceptions.
  580   //
  581   //         normal case                           special case
  582   //
  583   // input : rax: dividend                         min_long
  584   //         reg: divisor   (may not be eax/edx)   -1
  585   //
  586   // output: rax: quotient  (= rax idiv reg)       min_long
  587   //         rdx: remainder (= rax irem reg)       0
  588   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  589   static const int64_t min_long = 0x8000000000000000;
  590   Label normal_case, special_case;
  591 
  592   // check for special case
  593   cmp64(rax, ExternalAddress((address) &min_long));
  594   jcc(Assembler::notEqual, normal_case);
  595   xorl(rdx, rdx); // prepare rdx for possible special case (where
  596                   // remainder = 0)
  597   cmpq(reg, -1);
  598   jcc(Assembler::equal, special_case);
  599 
  600   // handle normal case
  601   bind(normal_case);
  602   cdqq();
  603   int idivq_offset = offset();
  604   idivq(reg);
  605 
  606   // normal and special case exit
  607   bind(special_case);
  608 
  609   return idivq_offset;
  610 }
  611 
  612 void MacroAssembler::decrementq(Register reg, int value) {
  613   if (value == min_jint) { subq(reg, value); return; }
  614   if (value <  0) { incrementq(reg, -value); return; }
  615   if (value == 0) {                        ; return; }
  616   if (value == 1 && UseIncDec) { decq(reg) ; return; }
  617   /* else */      { subq(reg, value)       ; return; }
  618 }
  619 
  620 void MacroAssembler::decrementq(Address dst, int value) {
  621   if (value == min_jint) { subq(dst, value); return; }
  622   if (value <  0) { incrementq(dst, -value); return; }
  623   if (value == 0) {                        ; return; }
  624   if (value == 1 && UseIncDec) { decq(dst) ; return; }
  625   /* else */      { subq(dst, value)       ; return; }
  626 }
  627 
  628 void MacroAssembler::incrementq(AddressLiteral dst) {
  629   if (reachable(dst)) {
  630     incrementq(as_Address(dst));
  631   } else {
  632     lea(rscratch1, dst);
  633     incrementq(Address(rscratch1, 0));
  634   }
  635 }
  636 
  637 void MacroAssembler::incrementq(Register reg, int value) {
  638   if (value == min_jint) { addq(reg, value); return; }
  639   if (value <  0) { decrementq(reg, -value); return; }
  640   if (value == 0) {                        ; return; }
  641   if (value == 1 && UseIncDec) { incq(reg) ; return; }
  642   /* else */      { addq(reg, value)       ; return; }
  643 }
  644 
  645 void MacroAssembler::incrementq(Address dst, int value) {
  646   if (value == min_jint) { addq(dst, value); return; }
  647   if (value <  0) { decrementq(dst, -value); return; }
  648   if (value == 0) {                        ; return; }
  649   if (value == 1 && UseIncDec) { incq(dst) ; return; }
  650   /* else */      { addq(dst, value)       ; return; }
  651 }
  652 
  653 // 32bit can do a case table jump in one instruction but we no longer allow the base
  654 // to be installed in the Address class
  655 void MacroAssembler::jump(ArrayAddress entry) {
  656   lea(rscratch1, entry.base());
  657   Address dispatch = entry.index();
  658   assert(dispatch._base == noreg, "must be");
  659   dispatch._base = rscratch1;
  660   jmp(dispatch);
  661 }
  662 
  663 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  664   ShouldNotReachHere(); // 64bit doesn't use two regs
  665   cmpq(x_lo, y_lo);
  666 }
  667 
  668 void MacroAssembler::lea(Register dst, AddressLiteral src) {
  669     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  670 }
  671 
  672 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  673   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  674   movptr(dst, rscratch1);
  675 }
  676 
  677 void MacroAssembler::leave() {
  678   // %%% is this really better? Why not on 32bit too?
  679   emit_int8((unsigned char)0xC9); // LEAVE
  680 }
  681 
  682 void MacroAssembler::lneg(Register hi, Register lo) {
  683   ShouldNotReachHere(); // 64bit doesn't use two regs
  684   negq(lo);
  685 }
  686 
  687 void MacroAssembler::movoop(Register dst, jobject obj) {
  688   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  689 }
  690 
  691 void MacroAssembler::movoop(Address dst, jobject obj) {
  692   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  693   movq(dst, rscratch1);
  694 }
  695 
  696 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  697   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  698 }
  699 
  700 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  701   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  702   movq(dst, rscratch1);
  703 }
  704 
  705 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  706   if (src.is_lval()) {
  707     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  708   } else {
  709     if (reachable(src)) {
  710       movq(dst, as_Address(src));
  711     } else {
  712       lea(scratch, src);
  713       movq(dst, Address(scratch, 0));
  714     }
  715   }
  716 }
  717 
  718 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  719   movq(as_Address(dst), src);
  720 }
  721 
  722 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  723   movq(dst, as_Address(src));
  724 }
  725 
  726 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
  727 void MacroAssembler::movptr(Address dst, intptr_t src) {
  728   mov64(rscratch1, src);
  729   movq(dst, rscratch1);
  730 }
  731 
  732 // These are mostly for initializing NULL
  733 void MacroAssembler::movptr(Address dst, int32_t src) {
  734   movslq(dst, src);
  735 }
  736 
  737 void MacroAssembler::movptr(Register dst, int32_t src) {
  738   mov64(dst, (intptr_t)src);
  739 }
  740 
  741 void MacroAssembler::pushoop(jobject obj) {
  742   movoop(rscratch1, obj);
  743   push(rscratch1);
  744 }
  745 
  746 void MacroAssembler::pushklass(Metadata* obj) {
  747   mov_metadata(rscratch1, obj);
  748   push(rscratch1);
  749 }
  750 
  751 void MacroAssembler::pushptr(AddressLiteral src) {
  752   lea(rscratch1, src);
  753   if (src.is_lval()) {
  754     push(rscratch1);
  755   } else {
  756     pushq(Address(rscratch1, 0));
  757   }
  758 }
  759 
  760 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  761   // we must set sp to zero to clear frame
  762   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  763   // must clear fp, so that compiled frames are not confused; it is
  764   // possible that we need it only for debugging
  765   if (clear_fp) {
  766     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  767   }
  768 
  769   // Always clear the pc because it could have been set by make_walkable()
  770   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  771   vzeroupper();
  772 }
  773 
  774 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
  775                                          Register last_java_fp,
  776                                          address  last_java_pc) {
  777   vzeroupper();
  778   // determine last_java_sp register
  779   if (!last_java_sp->is_valid()) {
  780     last_java_sp = rsp;
  781   }
  782 
  783   // last_java_fp is optional
  784   if (last_java_fp->is_valid()) {
  785     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
  786            last_java_fp);
  787   }
  788 
  789   // last_java_pc is optional
  790   if (last_java_pc != NULL) {
  791     Address java_pc(r15_thread,
  792                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
  793     lea(rscratch1, InternalAddress(last_java_pc));
  794     movptr(java_pc, rscratch1);
  795   }
  796 
  797   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
  798 }
  799 
  800 static void pass_arg0(MacroAssembler* masm, Register arg) {
  801   if (c_rarg0 != arg ) {
  802     masm->mov(c_rarg0, arg);
  803   }
  804 }
  805 
  806 static void pass_arg1(MacroAssembler* masm, Register arg) {
  807   if (c_rarg1 != arg ) {
  808     masm->mov(c_rarg1, arg);
  809   }
  810 }
  811 
  812 static void pass_arg2(MacroAssembler* masm, Register arg) {
  813   if (c_rarg2 != arg ) {
  814     masm->mov(c_rarg2, arg);
  815   }
  816 }
  817 
  818 static void pass_arg3(MacroAssembler* masm, Register arg) {
  819   if (c_rarg3 != arg ) {
  820     masm->mov(c_rarg3, arg);
  821   }
  822 }
  823 
  824 void MacroAssembler::stop(const char* msg) {
  825   address rip = pc();
  826   pusha(); // get regs on stack
  827   lea(c_rarg0, ExternalAddress((address) msg));
  828   lea(c_rarg1, InternalAddress(rip));
  829   movq(c_rarg2, rsp); // pass pointer to regs array
  830   andq(rsp, -16); // align stack as required by ABI
  831   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  832   hlt();
  833 }
  834 
  835 void MacroAssembler::warn(const char* msg) {
  836   push(rbp);
  837   movq(rbp, rsp);
  838   andq(rsp, -16);     // align stack as required by push_CPU_state and call
  839   push_CPU_state();   // keeps alignment at 16 bytes
  840   lea(c_rarg0, ExternalAddress((address) msg));
  841   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
  842   call(rax);
  843   pop_CPU_state();
  844   mov(rsp, rbp);
  845   pop(rbp);
  846 }
  847 
  848 void MacroAssembler::print_state() {
  849   address rip = pc();
  850   pusha();            // get regs on stack
  851   push(rbp);
  852   movq(rbp, rsp);
  853   andq(rsp, -16);     // align stack as required by push_CPU_state and call
  854   push_CPU_state();   // keeps alignment at 16 bytes
  855 
  856   lea(c_rarg0, InternalAddress(rip));
  857   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  858   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
  859 
  860   pop_CPU_state();
  861   mov(rsp, rbp);
  862   pop(rbp);
  863   popa();
  864 }
  865 
  866 #ifndef PRODUCT
  867 extern "C" void findpc(intptr_t x);
  868 #endif
  869 
  870 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  871   // In order to get locks to work, we need to fake a in_VM state
  872   if (ShowMessageBoxOnError) {
  873     JavaThread* thread = JavaThread::current();
  874     JavaThreadState saved_state = thread->thread_state();
  875     thread->set_thread_state(_thread_in_vm);
  876 #ifndef PRODUCT
  877     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
  878       ttyLocker ttyl;
  879       BytecodeCounter::print();
  880     }
  881 #endif
  882     // To see where a verify_oop failed, get $ebx+40/X for this frame.
  883     // XXX correct this offset for amd64
  884     // This is the value of eip which points to where verify_oop will return.
  885     if (os::message_box(msg, "Execution stopped, print registers?")) {
  886       print_state64(pc, regs);
  887       BREAKPOINT;
  888       assert(false, "start up GDB");
  889     }
  890     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  891   } else {
  892     ttyLocker ttyl;
  893     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
  894                     msg);
  895     assert(false, "DEBUG MESSAGE: %s", msg);
  896   }
  897 }
  898 
  899 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  900   ttyLocker ttyl;
  901   FlagSetting fs(Debugging, true);
  902   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
  903 #ifndef PRODUCT
  904   tty->cr();
  905   findpc(pc);
  906   tty->cr();
  907 #endif
  908 #define PRINT_REG(rax, value) \
  909   { tty->print("%s = ", #rax); os::print_location(tty, value); }
  910   PRINT_REG(rax, regs[15]);
  911   PRINT_REG(rbx, regs[12]);
  912   PRINT_REG(rcx, regs[14]);
  913   PRINT_REG(rdx, regs[13]);
  914   PRINT_REG(rdi, regs[8]);
  915   PRINT_REG(rsi, regs[9]);
  916   PRINT_REG(rbp, regs[10]);
  917   PRINT_REG(rsp, regs[11]);
  918   PRINT_REG(r8 , regs[7]);
  919   PRINT_REG(r9 , regs[6]);
  920   PRINT_REG(r10, regs[5]);
  921   PRINT_REG(r11, regs[4]);
  922   PRINT_REG(r12, regs[3]);
  923   PRINT_REG(r13, regs[2]);
  924   PRINT_REG(r14, regs[1]);
  925   PRINT_REG(r15, regs[0]);
  926 #undef PRINT_REG
  927   // Print some words near top of staack.
  928   int64_t* rsp = (int64_t*) regs[11];
  929   int64_t* dump_sp = rsp;
  930   for (int col1 = 0; col1 < 8; col1++) {
  931     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
  932     os::print_location(tty, *dump_sp++);
  933   }
  934   for (int row = 0; row < 25; row++) {
  935     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
  936     for (int col = 0; col < 4; col++) {
  937       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
  938     }
  939     tty->cr();
  940   }
  941   // Print some instructions around pc:
  942   Disassembler::decode((address)pc-64, (address)pc);
  943   tty->print_cr("--------");
  944   Disassembler::decode((address)pc, (address)pc+32);
  945 }
  946 
  947 #endif // _LP64
  948 
  949 // Now versions that are common to 32/64 bit
  950 
  951 void MacroAssembler::addptr(Register dst, int32_t imm32) {
  952   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
  953 }
  954 
  955 void MacroAssembler::addptr(Register dst, Register src) {
  956   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
  957 }
  958 
  959 void MacroAssembler::addptr(Address dst, Register src) {
  960   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
  961 }
  962 
  963 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  964   if (reachable(src)) {
  965     Assembler::addsd(dst, as_Address(src));
  966   } else {
  967     lea(rscratch1, src);
  968     Assembler::addsd(dst, Address(rscratch1, 0));
  969   }
  970 }
  971 
  972 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  973   if (reachable(src)) {
  974     addss(dst, as_Address(src));
  975   } else {
  976     lea(rscratch1, src);
  977     addss(dst, Address(rscratch1, 0));
  978   }
  979 }
  980 
  981 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
  982   if (reachable(src)) {
  983     Assembler::addpd(dst, as_Address(src));
  984   } else {
  985     lea(rscratch1, src);
  986     Assembler::addpd(dst, Address(rscratch1, 0));
  987   }
  988 }
  989 
  990 void MacroAssembler::align(int modulus) {
  991   align(modulus, offset());
  992 }
  993 
  994 void MacroAssembler::align(int modulus, int target) {
  995   if (target % modulus != 0) {
  996     nop(modulus - (target % modulus));
  997   }
  998 }
  999 
 1000 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
 1001   // Used in sign-masking with aligned address.
 1002   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
 1003   if (reachable(src)) {
 1004     Assembler::andpd(dst, as_Address(src));
 1005   } else {
 1006     lea(rscratch1, src);
 1007     Assembler::andpd(dst, Address(rscratch1, 0));
 1008   }
 1009 }
 1010 
 1011 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
 1012   // Used in sign-masking with aligned address.
 1013   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
 1014   if (reachable(src)) {
 1015     Assembler::andps(dst, as_Address(src));
 1016   } else {
 1017     lea(rscratch1, src);
 1018     Assembler::andps(dst, Address(rscratch1, 0));
 1019   }
 1020 }
 1021 
 1022 void MacroAssembler::andptr(Register dst, int32_t imm32) {
 1023   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
 1024 }
 1025 
 1026 void MacroAssembler::atomic_incl(Address counter_addr) {
 1027   if (os::is_MP())
 1028     lock();
 1029   incrementl(counter_addr);
 1030 }
 1031 
 1032 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
 1033   if (reachable(counter_addr)) {
 1034     atomic_incl(as_Address(counter_addr));
 1035   } else {
 1036     lea(scr, counter_addr);
 1037     atomic_incl(Address(scr, 0));
 1038   }
 1039 }
 1040 
 1041 #ifdef _LP64
 1042 void MacroAssembler::atomic_incq(Address counter_addr) {
 1043   if (os::is_MP())
 1044     lock();
 1045   incrementq(counter_addr);
 1046 }
 1047 
 1048 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
 1049   if (reachable(counter_addr)) {
 1050     atomic_incq(as_Address(counter_addr));
 1051   } else {
 1052     lea(scr, counter_addr);
 1053     atomic_incq(Address(scr, 0));
 1054   }
 1055 }
 1056 #endif
 1057 
 1058 // Writes to stack successive pages until offset reached to check for
 1059 // stack overflow + shadow pages.  This clobbers tmp.
 1060 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
 1061   movptr(tmp, rsp);
 1062   // Bang stack for total size given plus shadow page size.
 1063   // Bang one page at a time because large size can bang beyond yellow and
 1064   // red zones.
 1065   Label loop;
 1066   bind(loop);
 1067   movl(Address(tmp, (-os::vm_page_size())), size );
 1068   subptr(tmp, os::vm_page_size());
 1069   subl(size, os::vm_page_size());
 1070   jcc(Assembler::greater, loop);
 1071 
 1072   // Bang down shadow pages too.
 1073   // At this point, (tmp-0) is the last address touched, so don't
 1074   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
 1075   // was post-decremented.)  Skip this address by starting at i=1, and
 1076   // touch a few more pages below.  N.B.  It is important to touch all
 1077   // the way down including all pages in the shadow zone.
 1078   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
 1079     // this could be any sized move but this is can be a debugging crumb
 1080     // so the bigger the better.
 1081     movptr(Address(tmp, (-i*os::vm_page_size())), size );
 1082   }
 1083 }
 1084 
 1085 void MacroAssembler::reserved_stack_check() {
 1086     // testing if reserved zone needs to be enabled
 1087     Label no_reserved_zone_enabling;
 1088     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
 1089     NOT_LP64(get_thread(rsi);)
 1090 
 1091     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
 1092     jcc(Assembler::below, no_reserved_zone_enabling);
 1093 
 1094     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
 1095     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
 1096     should_not_reach_here();
 1097 
 1098     bind(no_reserved_zone_enabling);
 1099 }
 1100 
 1101 int MacroAssembler::biased_locking_enter(Register lock_reg,
 1102                                          Register obj_reg,
 1103                                          Register swap_reg,
 1104                                          Register tmp_reg,
 1105                                          bool swap_reg_contains_mark,
 1106                                          Label& done,
 1107                                          Label* slow_case,
 1108                                          BiasedLockingCounters* counters) {
 1109   assert(UseBiasedLocking, "why call this otherwise?");
 1110   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
 1111   assert(tmp_reg != noreg, "tmp_reg must be supplied");
 1112   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
 1113   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 1114   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 1115   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
 1116 
 1117   if (PrintBiasedLockingStatistics && counters == NULL) {
 1118     counters = BiasedLocking::counters();
 1119   }
 1120   // Biased locking
 1121   // See whether the lock is currently biased toward our thread and
 1122   // whether the epoch is still valid
 1123   // Note that the runtime guarantees sufficient alignment of JavaThread
 1124   // pointers to allow age to be placed into low bits
 1125   // First check to see whether biasing is even enabled for this object
 1126   Label cas_label;
 1127   int null_check_offset = -1;
 1128   if (!swap_reg_contains_mark) {
 1129     null_check_offset = offset();
 1130     movptr(swap_reg, mark_addr);
 1131   }
 1132   movptr(tmp_reg, swap_reg);
 1133   andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
 1134   cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
 1135   jcc(Assembler::notEqual, cas_label);
 1136   // The bias pattern is present in the object's header. Need to check
 1137   // whether the bias owner and the epoch are both still current.
 1138 #ifndef _LP64
 1139   // Note that because there is no current thread register on x86_32 we
 1140   // need to store off the mark word we read out of the object to
 1141   // avoid reloading it and needing to recheck invariants below. This
 1142   // store is unfortunate but it makes the overall code shorter and
 1143   // simpler.
 1144   movptr(saved_mark_addr, swap_reg);
 1145 #endif
 1146   if (swap_reg_contains_mark) {
 1147     null_check_offset = offset();
 1148   }
 1149   load_prototype_header(tmp_reg, obj_reg);
 1150 #ifdef _LP64
 1151   orptr(tmp_reg, r15_thread);
 1152   xorptr(tmp_reg, swap_reg);
 1153   Register header_reg = tmp_reg;
 1154 #else
 1155   xorptr(tmp_reg, swap_reg);
 1156   get_thread(swap_reg);
 1157   xorptr(swap_reg, tmp_reg);
 1158   Register header_reg = swap_reg;
 1159 #endif
 1160   andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
 1161   if (counters != NULL) {
 1162     cond_inc32(Assembler::zero,
 1163                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
 1164   }
 1165   jcc(Assembler::equal, done);
 1166 
 1167   Label try_revoke_bias;
 1168   Label try_rebias;
 1169 
 1170   // At this point we know that the header has the bias pattern and
 1171   // that we are not the bias owner in the current epoch. We need to
 1172   // figure out more details about the state of the header in order to
 1173   // know what operations can be legally performed on the object's
 1174   // header.
 1175 
 1176   // If the low three bits in the xor result aren't clear, that means
 1177   // the prototype header is no longer biased and we have to revoke
 1178   // the bias on this object.
 1179   testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
 1180   jccb(Assembler::notZero, try_revoke_bias);
 1181 
 1182   // Biasing is still enabled for this data type. See whether the
 1183   // epoch of the current bias is still valid, meaning that the epoch
 1184   // bits of the mark word are equal to the epoch bits of the
 1185   // prototype header. (Note that the prototype header's epoch bits
 1186   // only change at a safepoint.) If not, attempt to rebias the object
 1187   // toward the current thread. Note that we must be absolutely sure
 1188   // that the current epoch is invalid in order to do this because
 1189   // otherwise the manipulations it performs on the mark word are
 1190   // illegal.
 1191   testptr(header_reg, markOopDesc::epoch_mask_in_place);
 1192   jccb(Assembler::notZero, try_rebias);
 1193 
 1194   // The epoch of the current bias is still valid but we know nothing
 1195   // about the owner; it might be set or it might be clear. Try to
 1196   // acquire the bias of the object using an atomic operation. If this
 1197   // fails we will go in to the runtime to revoke the object's bias.
 1198   // Note that we first construct the presumed unbiased header so we
 1199   // don't accidentally blow away another thread's valid bias.
 1200   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
 1201   andptr(swap_reg,
 1202          markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 1203 #ifdef _LP64
 1204   movptr(tmp_reg, swap_reg);
 1205   orptr(tmp_reg, r15_thread);
 1206 #else
 1207   get_thread(tmp_reg);
 1208   orptr(tmp_reg, swap_reg);
 1209 #endif
 1210   if (os::is_MP()) {
 1211     lock();
 1212   }
 1213   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
 1214   // If the biasing toward our thread failed, this means that
 1215   // another thread succeeded in biasing it toward itself and we
 1216   // need to revoke that bias. The revocation will occur in the
 1217   // interpreter runtime in the slow case.
 1218   if (counters != NULL) {
 1219     cond_inc32(Assembler::zero,
 1220                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
 1221   }
 1222   if (slow_case != NULL) {
 1223     jcc(Assembler::notZero, *slow_case);
 1224   }
 1225   jmp(done);
 1226 
 1227   bind(try_rebias);
 1228   // At this point we know the epoch has expired, meaning that the
 1229   // current "bias owner", if any, is actually invalid. Under these
 1230   // circumstances _only_, we are allowed to use the current header's
 1231   // value as the comparison value when doing the cas to acquire the
 1232   // bias in the current epoch. In other words, we allow transfer of
 1233   // the bias from one thread to another directly in this situation.
 1234   //
 1235   // FIXME: due to a lack of registers we currently blow away the age
 1236   // bits in this situation. Should attempt to preserve them.
 1237   load_prototype_header(tmp_reg, obj_reg);
 1238 #ifdef _LP64
 1239   orptr(tmp_reg, r15_thread);
 1240 #else
 1241   get_thread(swap_reg);
 1242   orptr(tmp_reg, swap_reg);
 1243   movptr(swap_reg, saved_mark_addr);
 1244 #endif
 1245   if (os::is_MP()) {
 1246     lock();
 1247   }
 1248   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
 1249   // If the biasing toward our thread failed, then another thread
 1250   // succeeded in biasing it toward itself and we need to revoke that
 1251   // bias. The revocation will occur in the runtime in the slow case.
 1252   if (counters != NULL) {
 1253     cond_inc32(Assembler::zero,
 1254                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
 1255   }
 1256   if (slow_case != NULL) {
 1257     jcc(Assembler::notZero, *slow_case);
 1258   }
 1259   jmp(done);
 1260 
 1261   bind(try_revoke_bias);
 1262   // The prototype mark in the klass doesn't have the bias bit set any
 1263   // more, indicating that objects of this data type are not supposed
 1264   // to be biased any more. We are going to try to reset the mark of
 1265   // this object to the prototype value and fall through to the
 1266   // CAS-based locking scheme. Note that if our CAS fails, it means
 1267   // that another thread raced us for the privilege of revoking the
 1268   // bias of this particular object, so it's okay to continue in the
 1269   // normal locking code.
 1270   //
 1271   // FIXME: due to a lack of registers we currently blow away the age
 1272   // bits in this situation. Should attempt to preserve them.
 1273   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
 1274   load_prototype_header(tmp_reg, obj_reg);
 1275   if (os::is_MP()) {
 1276     lock();
 1277   }
 1278   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
 1279   // Fall through to the normal CAS-based lock, because no matter what
 1280   // the result of the above CAS, some thread must have succeeded in
 1281   // removing the bias bit from the object's header.
 1282   if (counters != NULL) {
 1283     cond_inc32(Assembler::zero,
 1284                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
 1285   }
 1286 
 1287   bind(cas_label);
 1288 
 1289   return null_check_offset;
 1290 }
 1291 
 1292 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 1293   assert(UseBiasedLocking, "why call this otherwise?");
 1294 
 1295   // Check for biased locking unlock case, which is a no-op
 1296   // Note: we do not have to check the thread ID for two reasons.
 1297   // First, the interpreter checks for IllegalMonitorStateException at
 1298   // a higher level. Second, if the bias was revoked while we held the
 1299   // lock, the object could not be rebiased toward another thread, so
 1300   // the bias bit would be clear.
 1301   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 1302   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
 1303   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
 1304   jcc(Assembler::equal, done);
 1305 }
 1306 
 1307 #ifdef COMPILER2
 1308 
 1309 #if INCLUDE_RTM_OPT
 1310 
 1311 // Update rtm_counters based on abort status
 1312 // input: abort_status
 1313 //        rtm_counters (RTMLockingCounters*)
 1314 // flags are killed
 1315 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
 1316 
 1317   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
 1318   if (PrintPreciseRTMLockingStatistics) {
 1319     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
 1320       Label check_abort;
 1321       testl(abort_status, (1<<i));
 1322       jccb(Assembler::equal, check_abort);
 1323       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
 1324       bind(check_abort);
 1325     }
 1326   }
 1327 }
 1328 
 1329 // Branch if (random & (count-1) != 0), count is 2^n
 1330 // tmp, scr and flags are killed
 1331 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
 1332   assert(tmp == rax, "");
 1333   assert(scr == rdx, "");
 1334   rdtsc(); // modifies EDX:EAX
 1335   andptr(tmp, count-1);
 1336   jccb(Assembler::notZero, brLabel);
 1337 }
 1338 
 1339 // Perform abort ratio calculation, set no_rtm bit if high ratio
 1340 // input:  rtm_counters_Reg (RTMLockingCounters* address)
 1341 // tmpReg, rtm_counters_Reg and flags are killed
 1342 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
 1343                                                  Register rtm_counters_Reg,
 1344                                                  RTMLockingCounters* rtm_counters,
 1345                                                  Metadata* method_data) {
 1346   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
 1347 
 1348   if (RTMLockingCalculationDelay > 0) {
 1349     // Delay calculation
 1350     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
 1351     testptr(tmpReg, tmpReg);
 1352     jccb(Assembler::equal, L_done);
 1353   }
 1354   // Abort ratio calculation only if abort_count > RTMAbortThreshold
 1355   //   Aborted transactions = abort_count * 100
 1356   //   All transactions = total_count *  RTMTotalCountIncrRate
 1357   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
 1358 
 1359   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
 1360   cmpptr(tmpReg, RTMAbortThreshold);
 1361   jccb(Assembler::below, L_check_always_rtm2);
 1362   imulptr(tmpReg, tmpReg, 100);
 1363 
 1364   Register scrReg = rtm_counters_Reg;
 1365   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 1366   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
 1367   imulptr(scrReg, scrReg, RTMAbortRatio);
 1368   cmpptr(tmpReg, scrReg);
 1369   jccb(Assembler::below, L_check_always_rtm1);
 1370   if (method_data != NULL) {
 1371     // set rtm_state to "no rtm" in MDO
 1372     mov_metadata(tmpReg, method_data);
 1373     if (os::is_MP()) {
 1374       lock();
 1375     }
 1376     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
 1377   }
 1378   jmpb(L_done);
 1379   bind(L_check_always_rtm1);
 1380   // Reload RTMLockingCounters* address
 1381   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 1382   bind(L_check_always_rtm2);
 1383   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
 1384   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
 1385   jccb(Assembler::below, L_done);
 1386   if (method_data != NULL) {
 1387     // set rtm_state to "always rtm" in MDO
 1388     mov_metadata(tmpReg, method_data);
 1389     if (os::is_MP()) {
 1390       lock();
 1391     }
 1392     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
 1393   }
 1394   bind(L_done);
 1395 }
 1396 
 1397 // Update counters and perform abort ratio calculation
 1398 // input:  abort_status_Reg
 1399 // rtm_counters_Reg, flags are killed
 1400 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
 1401                                    Register rtm_counters_Reg,
 1402                                    RTMLockingCounters* rtm_counters,
 1403                                    Metadata* method_data,
 1404                                    bool profile_rtm) {
 1405 
 1406   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 1407   // update rtm counters based on rax value at abort
 1408   // reads abort_status_Reg, updates flags
 1409   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
 1410   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
 1411   if (profile_rtm) {
 1412     // Save abort status because abort_status_Reg is used by following code.
 1413     if (RTMRetryCount > 0) {
 1414       push(abort_status_Reg);
 1415     }
 1416     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 1417     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
 1418     // restore abort status
 1419     if (RTMRetryCount > 0) {
 1420       pop(abort_status_Reg);
 1421     }
 1422   }
 1423 }
 1424 
 1425 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
 1426 // inputs: retry_count_Reg
 1427 //       : abort_status_Reg
 1428 // output: retry_count_Reg decremented by 1
 1429 // flags are killed
 1430 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
 1431   Label doneRetry;
 1432   assert(abort_status_Reg == rax, "");
 1433   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
 1434   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
 1435   // if reason is in 0x6 and retry count != 0 then retry
 1436   andptr(abort_status_Reg, 0x6);
 1437   jccb(Assembler::zero, doneRetry);
 1438   testl(retry_count_Reg, retry_count_Reg);
 1439   jccb(Assembler::zero, doneRetry);
 1440   pause();
 1441   decrementl(retry_count_Reg);
 1442   jmp(retryLabel);
 1443   bind(doneRetry);
 1444 }
 1445 
1446 // Spin and retry if lock is busy.
 1447 // inputs: box_Reg (monitor address)
 1448 //       : retry_count_Reg
 1449 // output: retry_count_Reg decremented by 1
 1450 //       : clear z flag if retry count exceeded
 1451 // tmp_Reg, scr_Reg, flags are killed
 1452 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
 1453                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
 1454   Label SpinLoop, SpinExit, doneRetry;
 1455   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 1456 
 1457   testl(retry_count_Reg, retry_count_Reg);
 1458   jccb(Assembler::zero, doneRetry);
 1459   decrementl(retry_count_Reg);
 1460   movptr(scr_Reg, RTMSpinLoopCount);
 1461 
 1462   bind(SpinLoop);
 1463   pause();
 1464   decrementl(scr_Reg);
 1465   jccb(Assembler::lessEqual, SpinExit);
 1466   movptr(tmp_Reg, Address(box_Reg, owner_offset));
 1467   testptr(tmp_Reg, tmp_Reg);
 1468   jccb(Assembler::notZero, SpinLoop);
 1469 
 1470   bind(SpinExit);
 1471   jmp(retryLabel);
 1472   bind(doneRetry);
 1473   incrementl(retry_count_Reg); // clear z flag
 1474 }
 1475 
 1476 // Use RTM for normal stack locks
 1477 // Input: objReg (object to lock)
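// tmpReg must be rax and scrReg must be rdx; tmpReg, scrReg,
// retry_on_abort_count_Reg and the flags are killed.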
 1478 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
 1479                                        Register retry_on_abort_count_Reg,
 1480                                        RTMLockingCounters* stack_rtm_counters,
 1481                                        Metadata* method_data, bool profile_rtm,
 1482                                        Label& DONE_LABEL, Label& IsInflated) {
 1483   assert(UseRTMForStackLocks, "why call this otherwise?");
 1484   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 1485   assert(tmpReg == rax, "");
 1486   assert(scrReg == rdx, "");
 1487   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 1488 
 1489   if (RTMRetryCount > 0) {
 1490     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 1491     bind(L_rtm_retry);
 1492   }
 1493   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 1494   testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
 1495   jcc(Assembler::notZero, IsInflated);
 1496 
 1497   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 1498     Label L_noincrement;
 1499     if (RTMTotalCountIncrRate > 1) {
 1500       // tmpReg, scrReg and flags are killed
 1501       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 1502     }
 1503     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
 1504     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
 1505     bind(L_noincrement);
 1506   }
 1507   xbegin(L_on_abort);
 1508   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
 1509   andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
 1510   cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
 1511   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
 1512 
 1513   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 1514   if (UseRTMXendForLockBusy) {
 1515     xend();
 1516     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
 1517     jmp(L_decrement_retry);
 1518   }
 1519   else {
 1520     xabort(0);
 1521   }
 1522   bind(L_on_abort);
 1523   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 1524     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
 1525   }
 1526   bind(L_decrement_retry);
 1527   if (RTMRetryCount > 0) {
 1528     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 1529     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 1530   }
 1531 }
 1532 
 1533 // Use RTM for inflating locks
 1534 // inputs: objReg (object to lock)
 1535 //         boxReg (on-stack box address (displaced header location) - KILLED)
 1536 //         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
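//         tmpReg must be rax and scrReg must be rdx; the scratch and retry
//         count registers are killed.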
 1537 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
 1538                                           Register scrReg, Register retry_on_busy_count_Reg,
 1539                                           Register retry_on_abort_count_Reg,
 1540                                           RTMLockingCounters* rtm_counters,
 1541                                           Metadata* method_data, bool profile_rtm,
 1542                                           Label& DONE_LABEL) {
 1543   assert(UseRTMLocking, "why call this otherwise?");
 1544   assert(tmpReg == rax, "");
 1545   assert(scrReg == rdx, "");
 1546   Label L_rtm_retry, L_decrement_retry, L_on_abort;
 1547   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 1548 
1549   // Without a cast to int32_t, movptr will destroy r10, which typically holds obj
 1550   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
 1551   movptr(boxReg, tmpReg); // Save ObjectMonitor address
 1552 
 1553   if (RTMRetryCount > 0) {
 1554     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
 1555     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
 1556     bind(L_rtm_retry);
 1557   }
 1558   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 1559     Label L_noincrement;
 1560     if (RTMTotalCountIncrRate > 1) {
 1561       // tmpReg, scrReg and flags are killed
 1562       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
 1563     }
 1564     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
 1565     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
 1566     bind(L_noincrement);
 1567   }
 1568   xbegin(L_on_abort);
 1569   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
 1570   movptr(tmpReg, Address(tmpReg, owner_offset));
 1571   testptr(tmpReg, tmpReg);
 1572   jcc(Assembler::zero, DONE_LABEL);
 1573   if (UseRTMXendForLockBusy) {
 1574     xend();
 1575     jmp(L_decrement_retry);
 1576   }
 1577   else {
 1578     xabort(0);
 1579   }
 1580   bind(L_on_abort);
 1581   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
 1582   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
 1583     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
 1584   }
 1585   if (RTMRetryCount > 0) {
 1586     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
 1587     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
 1588   }
 1589 
 1590   movptr(tmpReg, Address(boxReg, owner_offset)) ;
 1591   testptr(tmpReg, tmpReg) ;
 1592   jccb(Assembler::notZero, L_decrement_retry) ;
 1593 
 1594   // Appears unlocked - try to swing _owner from null to non-null.
 1595   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 1596 #ifdef _LP64
 1597   Register threadReg = r15_thread;
 1598 #else
 1599   get_thread(scrReg);
 1600   Register threadReg = scrReg;
 1601 #endif
 1602   if (os::is_MP()) {
 1603     lock();
 1604   }
 1605   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
 1606 
 1607   if (RTMRetryCount > 0) {
1608     // on success we are done, otherwise retry
 1609     jccb(Assembler::equal, DONE_LABEL) ;
 1610     bind(L_decrement_retry);
 1611     // Spin and retry if lock is busy.
 1612     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
 1613   }
 1614   else {
 1615     bind(L_decrement_retry);
 1616   }
 1617 }
 1618 
 1619 #endif //  INCLUDE_RTM_OPT
 1620 
 1621 // Fast_Lock and Fast_Unlock used by C2
 1622 
 1623 // Because the transitions from emitted code to the runtime
 1624 // monitorenter/exit helper stubs are so slow it's critical that
 1625 // we inline both the stack-locking fast-path and the inflated fast path.
 1626 //
 1627 // See also: cmpFastLock and cmpFastUnlock.
 1628 //
 1629 // What follows is a specialized inline transliteration of the code
 1630 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
 1631 // another option would be to emit TrySlowEnter and TrySlowExit methods
 1632 // at startup-time.  These methods would accept arguments as
1633 //    (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
 1634 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
 1635 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
 1636 // In practice, however, the # of lock sites is bounded and is usually small.
 1637 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1638 // if the processor uses simple bimodal branch predictors keyed by EIP,
1639 // since the helper routines would be called from multiple synchronization
1640 // sites.
 1641 //
1642 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
 1643 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
 1644 // to those specialized methods.  That'd give us a mostly platform-independent
 1645 // implementation that the JITs could optimize and inline at their pleasure.
1646 // Done correctly, the only time we'd need to cross to native code would be
1647 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1648 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1649 // (b) provide explicit barriers or fence operations.
 1650 //
 1651 // TODO:
 1652 //
 1653 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
 1654 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
 1655 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
 1656 //    the lock operators would typically be faster than reifying Self.
 1657 //
 1658 // *  Ideally I'd define the primitives as:
 1659 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
 1660 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
 1661 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1662 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
 1663 //    Furthermore the register assignments are overconstrained, possibly resulting in
 1664 //    sub-optimal code near the synchronization site.
 1665 //
 1666 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
 1667 //    Alternately, use a better sp-proximity test.
 1668 //
 1669 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
 1670 //    Either one is sufficient to uniquely identify a thread.
 1671 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
 1672 //
 1673 // *  Intrinsify notify() and notifyAll() for the common cases where the
1674 //    object is locked by the calling thread but the waitlist is empty,
1675 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
 1676 //
 1677 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
 1678 //    But beware of excessive branch density on AMD Opterons.
 1679 //
 1680 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
 1681 //    or failure of the fast-path.  If the fast-path fails then we pass
 1682 //    control to the slow-path, typically in C.  In Fast_Lock and
 1683 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
 1684 //    will emit a conditional branch immediately after the node.
 1685 //    So we have branches to branches and lots of ICC.ZF games.
 1686 //    Instead, it might be better to have C2 pass a "FailureLabel"
 1687 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
 1688 //    will drop through the node.  ICC.ZF is undefined at exit.
 1689 //    In the case of failure, the node will branch directly to the
1690 //    FailureLabel.
 1691 
 1692 
 1693 // obj: object to lock
 1694 // box: on-stack box address (displaced header location) - KILLED
1695 // rax: tmp -- KILLED
 1696 // scr: tmp -- KILLED
 1697 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
 1698                                Register scrReg, Register cx1Reg, Register cx2Reg,
 1699                                BiasedLockingCounters* counters,
 1700                                RTMLockingCounters* rtm_counters,
 1701                                RTMLockingCounters* stack_rtm_counters,
 1702                                Metadata* method_data,
 1703                                bool use_rtm, bool profile_rtm) {
 1704   // Ensure the register assignments are disjoint
 1705   assert(tmpReg == rax, "");
 1706 
 1707   if (use_rtm) {
 1708     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
 1709   } else {
 1710     assert(cx1Reg == noreg, "");
 1711     assert(cx2Reg == noreg, "");
 1712     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
 1713   }
 1714 
 1715   if (counters != NULL) {
 1716     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
 1717   }
 1718   if (EmitSync & 1) {
 1719       // set box->dhw = markOopDesc::unused_mark()
 1720       // Force all sync thru slow-path: slow_enter() and slow_exit()
 1721       movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
 1722       cmpptr (rsp, (int32_t)NULL_WORD);
 1723   } else {
 1724     // Possible cases that we'll encounter in fast_lock
 1725     // ------------------------------------------------
 1726     // * Inflated
 1727     //    -- unlocked
 1728     //    -- Locked
 1729     //       = by self
 1730     //       = by other
 1731     // * biased
 1732     //    -- by Self
 1733     //    -- by other
 1734     // * neutral
 1735     // * stack-locked
 1736     //    -- by self
 1737     //       = sp-proximity test hits
 1738     //       = sp-proximity test generates false-negative
 1739     //    -- by other
 1740     //
 1741 
 1742     Label IsInflated, DONE_LABEL;
 1743 
 1744     // it's stack-locked, biased or neutral
 1745     // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
 1746     // order to reduce the number of conditional branches in the most common cases.
 1747     // Beware -- there's a subtle invariant that fetch of the markword
 1748     // at [FETCH], below, will never observe a biased encoding (*101b).
 1749     // If this invariant is not held we risk exclusion (safety) failure.
 1750     if (UseBiasedLocking && !UseOptoBiasInlining) {
 1751       biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
 1752     }
 1753 
 1754 #if INCLUDE_RTM_OPT
 1755     if (UseRTMForStackLocks && use_rtm) {
 1756       rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
 1757                         stack_rtm_counters, method_data, profile_rtm,
 1758                         DONE_LABEL, IsInflated);
 1759     }
 1760 #endif // INCLUDE_RTM_OPT
 1761 
 1762     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
 1763     testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
 1764     jccb(Assembler::notZero, IsInflated);
 1765 
 1766     // Attempt stack-locking ...
 1767     orptr (tmpReg, markOopDesc::unlocked_value);
 1768     movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
 1769     if (os::is_MP()) {
 1770       lock();
 1771     }
 1772     cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
 1773     if (counters != NULL) {
 1774       cond_inc32(Assembler::equal,
 1775                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 1776     }
 1777     jcc(Assembler::equal, DONE_LABEL);           // Success
 1778 
 1779     // Recursive locking.
 1780     // The object is stack-locked: markword contains stack pointer to BasicLock.
 1781     // Locked by current thread if difference with current SP is less than one page.
 1782     subptr(tmpReg, rsp);
1783     // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
 1784     andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
 1785     movptr(Address(boxReg, 0), tmpReg);
 1786     if (counters != NULL) {
 1787       cond_inc32(Assembler::equal,
 1788                  ExternalAddress((address)counters->fast_path_entry_count_addr()));
 1789     }
 1790     jmp(DONE_LABEL);
 1791 
 1792     bind(IsInflated);
 1793     // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
 1794 
 1795 #if INCLUDE_RTM_OPT
 1796     // Use the same RTM locking code in 32- and 64-bit VM.
 1797     if (use_rtm) {
 1798       rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
 1799                            rtm_counters, method_data, profile_rtm, DONE_LABEL);
 1800     } else {
 1801 #endif // INCLUDE_RTM_OPT
 1802 
 1803 #ifndef _LP64
 1804     // The object is inflated.
 1805 
 1806     // boxReg refers to the on-stack BasicLock in the current frame.
 1807     // We'd like to write:
 1808     //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1809     // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
 1810     // additional latency as we have another ST in the store buffer that must drain.
 1811 
 1812     if (EmitSync & 8192) {
 1813        movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
 1814        get_thread (scrReg);
 1815        movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
 1816        movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
 1817        if (os::is_MP()) {
 1818          lock();
 1819        }
 1820        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 1821     } else
 1822     if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
 1823        // register juggle because we need tmpReg for cmpxchgptr below
 1824        movptr(scrReg, boxReg);
 1825        movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
 1826 
 1827        // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
 1828        if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
 1829           // prefetchw [eax + Offset(_owner)-2]
 1830           prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 1831        }
 1832 
 1833        if ((EmitSync & 64) == 0) {
 1834          // Optimistic form: consider XORL tmpReg,tmpReg
 1835          movptr(tmpReg, NULL_WORD);
 1836        } else {
 1837          // Can suffer RTS->RTO upgrades on shared or cold $ lines
 1838          // Test-And-CAS instead of CAS
 1839          movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
 1840          testptr(tmpReg, tmpReg);                   // Locked ?
 1841          jccb  (Assembler::notZero, DONE_LABEL);
 1842        }
 1843 
 1844        // Appears unlocked - try to swing _owner from null to non-null.
 1845        // Ideally, I'd manifest "Self" with get_thread and then attempt
 1846        // to CAS the register containing Self into m->Owner.
 1847        // But we don't have enough registers, so instead we can either try to CAS
 1848        // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
 1849        // we later store "Self" into m->Owner.  Transiently storing a stack address
 1850        // (rsp or the address of the box) into  m->owner is harmless.
 1851        // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 1852        if (os::is_MP()) {
 1853          lock();
 1854        }
 1855        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 1856        movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
 1857        // If we weren't able to swing _owner from NULL to the BasicLock
 1858        // then take the slow path.
 1859        jccb  (Assembler::notZero, DONE_LABEL);
 1860        // update _owner from BasicLock to thread
 1861        get_thread (scrReg);                    // beware: clobbers ICCs
 1862        movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
 1863        xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
 1864 
 1865        // If the CAS fails we can either retry or pass control to the slow-path.
 1866        // We use the latter tactic.
 1867        // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 1868        // If the CAS was successful ...
 1869        //   Self has acquired the lock
 1870        //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 1871        // Intentional fall-through into DONE_LABEL ...
 1872     } else {
 1873        movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
 1874        movptr(boxReg, tmpReg);
 1875 
 1876        // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
 1877        if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
 1878           // prefetchw [eax + Offset(_owner)-2]
 1879           prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 1880        }
 1881 
 1882        if ((EmitSync & 64) == 0) {
 1883          // Optimistic form
 1884          xorptr  (tmpReg, tmpReg);
 1885        } else {
 1886          // Can suffer RTS->RTO upgrades on shared or cold $ lines
 1887          movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
 1888          testptr(tmpReg, tmpReg);                   // Locked ?
 1889          jccb  (Assembler::notZero, DONE_LABEL);
 1890        }
 1891 
 1892        // Appears unlocked - try to swing _owner from null to non-null.
 1893        // Use either "Self" (in scr) or rsp as thread identity in _owner.
 1894        // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
 1895        get_thread (scrReg);
 1896        if (os::is_MP()) {
 1897          lock();
 1898        }
 1899        cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 1900 
 1901        // If the CAS fails we can either retry or pass control to the slow-path.
 1902        // We use the latter tactic.
 1903        // Pass the CAS result in the icc.ZFlag into DONE_LABEL
 1904        // If the CAS was successful ...
 1905        //   Self has acquired the lock
 1906        //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
 1907        // Intentional fall-through into DONE_LABEL ...
 1908     }
 1909 #else // _LP64
 1910     // It's inflated
 1911     movq(scrReg, tmpReg);
 1912     xorq(tmpReg, tmpReg);
 1913 
 1914     if (os::is_MP()) {
 1915       lock();
 1916     }
 1917     cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 1918     // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1919     // Without a cast to int32_t, movptr will destroy r10, which typically holds obj.
 1920     movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
 1921     // Intentional fall-through into DONE_LABEL ...
 1922     // Propagate ICC.ZF from CAS above into DONE_LABEL.
 1923 #endif // _LP64
 1924 #if INCLUDE_RTM_OPT
 1925     } // use_rtm()
 1926 #endif
 1927     // DONE_LABEL is a hot target - we'd really like to place it at the
 1928     // start of cache line by padding with NOPs.
 1929     // See the AMD and Intel software optimization manuals for the
 1930     // most efficient "long" NOP encodings.
 1931     // Unfortunately none of our alignment mechanisms suffice.
 1932     bind(DONE_LABEL);
 1933 
 1934     // At DONE_LABEL the icc ZFlag is set as follows ...
 1935     // Fast_Unlock uses the same protocol.
 1936     // ZFlag == 1 -> Success
 1937     // ZFlag == 0 -> Failure - force control through the slow-path
 1938   }
 1939 }
 1940 
 1941 // obj: object to unlock
 1942 // box: box address (displaced header location), killed.  Must be EAX.
 1943 // tmp: killed, cannot be obj nor box.
 1944 //
 1945 // Some commentary on balanced locking:
 1946 //
 1947 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
 1948 // Methods that don't have provably balanced locking are forced to run in the
 1949 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
 1950 // The interpreter provides two properties:
 1951 // I1:  At return-time the interpreter automatically and quietly unlocks any
1952 //      objects acquired by the current activation (frame).  Recall that the
 1953 //      interpreter maintains an on-stack list of locks currently held by
 1954 //      a frame.
1955 // I2:  If a method attempts to unlock an object that is not held by
1956 //      the frame, the interpreter throws IMSX.
 1957 //
1958 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
 1959 // B() doesn't have provably balanced locking so it runs in the interpreter.
 1960 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
 1961 // is still locked by A().
 1962 //
 1963 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 1964 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 1965 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 1966 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
 1967 // Arguably given that the spec legislates the JNI case as undefined our implementation
 1968 // could reasonably *avoid* checking owner in Fast_Unlock().
 1969 // In the interest of performance we elide m->Owner==Self check in unlock.
 1970 // A perfectly viable alternative is to elide the owner check except when
 1971 // Xcheck:jni is enabled.
 1972 
 1973 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
 1974   assert(boxReg == rax, "");
 1975   assert_different_registers(objReg, boxReg, tmpReg);
 1976 
 1977   if (EmitSync & 4) {
 1978     // Disable - inhibit all inlining.  Force control through the slow-path
 1979     cmpptr (rsp, 0);
 1980   } else {
 1981     Label DONE_LABEL, Stacked, CheckSucc;
 1982 
 1983     // Critically, the biased locking test must have precedence over
 1984     // and appear before the (box->dhw == 0) recursive stack-lock test.
 1985     if (UseBiasedLocking && !UseOptoBiasInlining) {
 1986        biased_locking_exit(objReg, tmpReg, DONE_LABEL);
 1987     }
 1988 
 1989 #if INCLUDE_RTM_OPT
 1990     if (UseRTMForStackLocks && use_rtm) {
 1991       assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
 1992       Label L_regular_unlock;
 1993       movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));           // fetch markword
 1994       andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
 1995       cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
 1996       jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
 1997       xend();                                       // otherwise end...
 1998       jmp(DONE_LABEL);                              // ... and we're done
 1999       bind(L_regular_unlock);
 2000     }
 2001 #endif
 2002 
 2003     cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
 2004     jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
 2005     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));             // Examine the object's markword
 2006     testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
 2007     jccb  (Assembler::zero, Stacked);
 2008 
 2009     // It's inflated.
 2010 #if INCLUDE_RTM_OPT
 2011     if (use_rtm) {
 2012       Label L_regular_inflated_unlock;
 2013       int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
 2014       movptr(boxReg, Address(tmpReg, owner_offset));
 2015       testptr(boxReg, boxReg);
 2016       jccb(Assembler::notZero, L_regular_inflated_unlock);
 2017       xend();
 2018       jmpb(DONE_LABEL);
 2019       bind(L_regular_inflated_unlock);
 2020     }
 2021 #endif
 2022 
 2023     // Despite our balanced locking property we still check that m->_owner == Self
 2024     // as java routines or native JNI code called by this thread might
 2025     // have released the lock.
 2026     // Refer to the comments in synchronizer.cpp for how we might encode extra
 2027     // state in _succ so we can avoid fetching EntryList|cxq.
 2028     //
 2029     // I'd like to add more cases in fast_lock() and fast_unlock() --
 2030     // such as recursive enter and exit -- but we have to be wary of
 2031     // I$ bloat, T$ effects and BP$ effects.
 2032     //
 2033     // If there's no contention try a 1-0 exit.  That is, exit without
 2034     // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
 2035     // we detect and recover from the race that the 1-0 exit admits.
 2036     //
 2037     // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
 2038     // before it STs null into _owner, releasing the lock.  Updates
 2039     // to data protected by the critical section must be visible before
 2040     // we drop the lock (and thus before any other thread could acquire
 2041     // the lock and observe the fields protected by the lock).
 2042     // IA32's memory-model is SPO, so STs are ordered with respect to
 2043     // each other and there's no need for an explicit barrier (fence).
 2044     // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
 2045 #ifndef _LP64
 2046     get_thread (boxReg);
 2047     if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
 2048       // prefetchw [ebx + Offset(_owner)-2]
 2049       prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 2050     }
 2051 
 2052     // Note that we could employ various encoding schemes to reduce
 2053     // the number of loads below (currently 4) to just 2 or 3.
 2054     // Refer to the comments in synchronizer.cpp.
 2055     // In practice the chain of fetches doesn't seem to impact performance, however.
 2056     xorptr(boxReg, boxReg);
 2057     if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
 2058        // Attempt to reduce branch density - AMD's branch predictor.
 2059        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 2060        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 2061        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 2062        jccb  (Assembler::notZero, DONE_LABEL);
 2063        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 2064        jmpb  (DONE_LABEL);
 2065     } else {
 2066        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 2067        jccb  (Assembler::notZero, DONE_LABEL);
 2068        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 2069        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 2070        jccb  (Assembler::notZero, CheckSucc);
 2071        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 2072        jmpb  (DONE_LABEL);
 2073     }
 2074 
2075     // The following code fragment (EmitSync & 65536) improves the performance of
 2076     // contended applications and contended synchronization microbenchmarks.
 2077     // Unfortunately the emission of the code - even though not executed - causes regressions
 2078     // in scimark and jetstream, evidently because of $ effects.  Replacing the code
 2079     // with an equal number of never-executed NOPs results in the same regression.
 2080     // We leave it off by default.
 2081 
 2082     if ((EmitSync & 65536) != 0) {
 2083        Label LSuccess, LGoSlowPath ;
 2084 
 2085        bind  (CheckSucc);
 2086 
 2087        // Optional pre-test ... it's safe to elide this
 2088        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 2089        jccb(Assembler::zero, LGoSlowPath);
 2090 
 2091        // We have a classic Dekker-style idiom:
 2092        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
 2093        // There are a number of ways to implement the barrier:
 2094        // (1) lock:andl &m->_owner, 0
2095        //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
 2096        //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
 2097        //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
 2098        // (2) If supported, an explicit MFENCE is appealing.
 2099        //     In older IA32 processors MFENCE is slower than lock:add or xchg
2100        //     particularly if the write-buffer is full, as might be the case
 2101        //     if stores closely precede the fence or fence-equivalent instruction.
 2102        //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 2103        //     as the situation has changed with Nehalem and Shanghai.
 2104        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
 2105        //     The $lines underlying the top-of-stack should be in M-state.
 2106        //     The locked add instruction is serializing, of course.
 2107        // (4) Use xchg, which is serializing
 2108        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
 2109        // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
 2110        //     The integer condition codes will tell us if succ was 0.
 2111        //     Since _succ and _owner should reside in the same $line and
 2112        //     we just stored into _owner, it's likely that the $line
 2113        //     remains in M-state for the lock:orl.
 2114        //
 2115        // We currently use (3), although it's likely that switching to (2)
 2116        // is correct for the future.
 2117 
 2118        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
 2119        if (os::is_MP()) {
 2120          lock(); addptr(Address(rsp, 0), 0);
 2121        }
 2122        // Ratify _succ remains non-null
 2123        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
 2124        jccb  (Assembler::notZero, LSuccess);
 2125 
 2126        xorptr(boxReg, boxReg);                  // box is really EAX
 2127        if (os::is_MP()) { lock(); }
 2128        cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 2129        // There's no successor so we tried to regrab the lock with the
 2130        // placeholder value. If that didn't work, then another thread
 2131        // grabbed the lock so we're done (and exit was a success).
 2132        jccb  (Assembler::notEqual, LSuccess);
2133        // Since we're low on registers we installed rsp as a placeholder in _owner.
2134        // Now install Self over rsp.  This is safe as we're transitioning from
2135        // non-null to non-null
 2136        get_thread (boxReg);
 2137        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
 2138        // Intentional fall-through into LGoSlowPath ...
 2139 
 2140        bind  (LGoSlowPath);
 2141        orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 2142        jmpb  (DONE_LABEL);
 2143 
 2144        bind  (LSuccess);
 2145        xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
 2146        jmpb  (DONE_LABEL);
 2147     }
 2148 
 2149     bind (Stacked);
 2150     // It's not inflated and it's not recursively stack-locked and it's not biased.
 2151     // It must be stack-locked.
 2152     // Try to reset the header to displaced header.
 2153     // The "box" value on the stack is stable, so we can reload
 2154     // and be assured we observe the same value as above.
 2155     movptr(tmpReg, Address(boxReg, 0));
 2156     if (os::is_MP()) {
 2157       lock();
 2158     }
 2159     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2160     // Intentional fall-through into DONE_LABEL
 2161 
 2162     // DONE_LABEL is a hot target - we'd really like to place it at the
 2163     // start of cache line by padding with NOPs.
 2164     // See the AMD and Intel software optimization manuals for the
 2165     // most efficient "long" NOP encodings.
 2166     // Unfortunately none of our alignment mechanisms suffice.
 2167     if ((EmitSync & 65536) == 0) {
 2168        bind (CheckSucc);
 2169     }
 2170 #else // _LP64
 2171     // It's inflated
 2172     if (EmitSync & 1024) {
 2173       // Emit code to check that _owner == Self
 2174       // We could fold the _owner test into subsequent code more efficiently
 2175       // than using a stand-alone check, but since _owner checking is off by
 2176       // default we don't bother. We also might consider predicating the
 2177       // _owner==Self check on Xcheck:jni or running on a debug build.
 2178       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 2179       xorptr(boxReg, r15_thread);
 2180     } else {
 2181       xorptr(boxReg, boxReg);
 2182     }
 2183     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
 2184     jccb  (Assembler::notZero, DONE_LABEL);
 2185     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
 2186     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
 2187     jccb  (Assembler::notZero, CheckSucc);
 2188     movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 2189     jmpb  (DONE_LABEL);
 2190 
 2191     if ((EmitSync & 65536) == 0) {
 2192       // Try to avoid passing control into the slow_path ...
 2193       Label LSuccess, LGoSlowPath ;
 2194       bind  (CheckSucc);
 2195 
 2196       // The following optional optimization can be elided if necessary
 2197       // Effectively: if (succ == null) goto SlowPath
 2198       // The code reduces the window for a race, however,
 2199       // and thus benefits performance.
 2200       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 2201       jccb  (Assembler::zero, LGoSlowPath);
 2202 
 2203       xorptr(boxReg, boxReg);
 2204       if ((EmitSync & 16) && os::is_MP()) {
 2205         xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 2206       } else {
 2207         movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
 2208         if (os::is_MP()) {
 2209           // Memory barrier/fence
 2210           // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
 2211           // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
 2212           // This is faster on Nehalem and AMD Shanghai/Barcelona.
 2213           // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
 2214           // We might also restructure (ST Owner=0;barrier;LD _Succ) to
 2215           // (mov box,0; xchgq box, &m->Owner; LD _succ) .
 2216           lock(); addl(Address(rsp, 0), 0);
 2217         }
 2218       }
 2219       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
 2220       jccb  (Assembler::notZero, LSuccess);
 2221 
 2222       // Rare inopportune interleaving - race.
 2223       // The successor vanished in the small window above.
 2224       // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
 2225       // We need to ensure progress and succession.
 2226       // Try to reacquire the lock.
 2227       // If that fails then the new owner is responsible for succession and this
 2228       // thread needs to take no further action and can exit via the fast path (success).
 2229       // If the re-acquire succeeds then pass control into the slow path.
2230       // As implemented, this latter mode is horrible because we generated more
2231       // coherence traffic on the lock *and* artificially extended the critical section
2232       // length by virtue of passing control into the slow path.
 2233 
 2234       // box is really RAX -- the following CMPXCHG depends on that binding
 2235       // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
 2236       if (os::is_MP()) { lock(); }
 2237       cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
 2238       // There's no successor so we tried to regrab the lock.
 2239       // If that didn't work, then another thread grabbed the
 2240       // lock so we're done (and exit was a success).
 2241       jccb  (Assembler::notEqual, LSuccess);
 2242       // Intentional fall-through into slow-path
 2243 
 2244       bind  (LGoSlowPath);
 2245       orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
 2246       jmpb  (DONE_LABEL);
 2247 
 2248       bind  (LSuccess);
 2249       testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
 2250       jmpb  (DONE_LABEL);
 2251     }
 2252 
 2253     bind  (Stacked);
 2254     movptr(tmpReg, Address (boxReg, 0));      // re-fetch
 2255     if (os::is_MP()) { lock(); }
 2256     cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
 2257 
 2258     if (EmitSync & 65536) {
 2259        bind (CheckSucc);
 2260     }
 2261 #endif
 2262     bind(DONE_LABEL);
 2263   }
 2264 }
 2265 #endif // COMPILER2
 2266 
 2267 void MacroAssembler::c2bool(Register x) {
 2268   // implements x == 0 ? 0 : 1
 2269   // note: must only look at least-significant byte of x
 2270   //       since C-style booleans are stored in one byte
 2271   //       only! (was bug)
 2272   andl(x, 0xFF);
 2273   setb(Assembler::notZero, x);
 2274 }
 2275 
2276 // Wouldn't be needed if the AddressLiteral version had a new name
 2277 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
 2278   Assembler::call(L, rtype);
 2279 }
 2280 
 2281 void MacroAssembler::call(Register entry) {
 2282   Assembler::call(entry);
 2283 }
 2284 
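// Call through an AddressLiteral: emit a direct call when the target is
// reachable from the code being generated, otherwise materialize the target
// in rscratch1 and call through the register.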
 2285 void MacroAssembler::call(AddressLiteral entry) {
 2286   if (reachable(entry)) {
 2287     Assembler::call_literal(entry.target(), entry.rspec());
 2288   } else {
 2289     lea(rscratch1, entry);
 2290     Assembler::call(rscratch1);
 2291   }
 2292 }
 2293 
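// Inline-cache call: rax (the inline-cache register on x86) is preloaded with
// the non-oop sentinel, and the call site is tagged with a virtual_call
// relocation that records the method index.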
 2294 void MacroAssembler::ic_call(address entry, jint method_index) {
 2295   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 2296   movptr(rax, (intptr_t)Universe::non_oop_word());
 2297   call(AddressLiteral(entry, rh));
 2298 }
 2299 
 2300 // Implementation of call_VM versions
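// The register-argument variants below emit a small out-of-line stub: the near
// call to label C pushes a return address that is later recovered as
// last_Java_pc (see call_VM_helper), while the jump to label E skips over the
// stub on the normal path.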
 2301 
 2302 void MacroAssembler::call_VM(Register oop_result,
 2303                              address entry_point,
 2304                              bool check_exceptions) {
 2305   Label C, E;
 2306   call(C, relocInfo::none);
 2307   jmp(E);
 2308 
 2309   bind(C);
 2310   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 2311   ret(0);
 2312 
 2313   bind(E);
 2314 }
 2315 
 2316 void MacroAssembler::call_VM(Register oop_result,
 2317                              address entry_point,
 2318                              Register arg_1,
 2319                              bool check_exceptions) {
 2320   Label C, E;
 2321   call(C, relocInfo::none);
 2322   jmp(E);
 2323 
 2324   bind(C);
 2325   pass_arg1(this, arg_1);
 2326   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 2327   ret(0);
 2328 
 2329   bind(E);
 2330 }
 2331 
 2332 void MacroAssembler::call_VM(Register oop_result,
 2333                              address entry_point,
 2334                              Register arg_1,
 2335                              Register arg_2,
 2336                              bool check_exceptions) {
 2337   Label C, E;
 2338   call(C, relocInfo::none);
 2339   jmp(E);
 2340 
 2341   bind(C);
 2342 
 2343   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2344 
 2345   pass_arg2(this, arg_2);
 2346   pass_arg1(this, arg_1);
 2347   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 2348   ret(0);
 2349 
 2350   bind(E);
 2351 }
 2352 
 2353 void MacroAssembler::call_VM(Register oop_result,
 2354                              address entry_point,
 2355                              Register arg_1,
 2356                              Register arg_2,
 2357                              Register arg_3,
 2358                              bool check_exceptions) {
 2359   Label C, E;
 2360   call(C, relocInfo::none);
 2361   jmp(E);
 2362 
 2363   bind(C);
 2364 
 2365   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
 2366   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
 2367   pass_arg3(this, arg_3);
 2368 
 2369   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2370   pass_arg2(this, arg_2);
 2371 
 2372   pass_arg1(this, arg_1);
 2373   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 2374   ret(0);
 2375 
 2376   bind(E);
 2377 }
 2378 
 2379 void MacroAssembler::call_VM(Register oop_result,
 2380                              Register last_java_sp,
 2381                              address entry_point,
 2382                              int number_of_arguments,
 2383                              bool check_exceptions) {
 2384   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
 2385   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 2386 }
 2387 
 2388 void MacroAssembler::call_VM(Register oop_result,
 2389                              Register last_java_sp,
 2390                              address entry_point,
 2391                              Register arg_1,
 2392                              bool check_exceptions) {
 2393   pass_arg1(this, arg_1);
 2394   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 2395 }
 2396 
 2397 void MacroAssembler::call_VM(Register oop_result,
 2398                              Register last_java_sp,
 2399                              address entry_point,
 2400                              Register arg_1,
 2401                              Register arg_2,
 2402                              bool check_exceptions) {
 2403 
 2404   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2405   pass_arg2(this, arg_2);
 2406   pass_arg1(this, arg_1);
 2407   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 2408 }
 2409 
 2410 void MacroAssembler::call_VM(Register oop_result,
 2411                              Register last_java_sp,
 2412                              address entry_point,
 2413                              Register arg_1,
 2414                              Register arg_2,
 2415                              Register arg_3,
 2416                              bool check_exceptions) {
 2417   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
 2418   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
 2419   pass_arg3(this, arg_3);
 2420   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2421   pass_arg2(this, arg_2);
 2422   pass_arg1(this, arg_1);
 2423   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 2424 }
 2425 
 2426 void MacroAssembler::super_call_VM(Register oop_result,
 2427                                    Register last_java_sp,
 2428                                    address entry_point,
 2429                                    int number_of_arguments,
 2430                                    bool check_exceptions) {
 2431   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
 2432   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 2433 }
 2434 
 2435 void MacroAssembler::super_call_VM(Register oop_result,
 2436                                    Register last_java_sp,
 2437                                    address entry_point,
 2438                                    Register arg_1,
 2439                                    bool check_exceptions) {
 2440   pass_arg1(this, arg_1);
 2441   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 2442 }
 2443 
 2444 void MacroAssembler::super_call_VM(Register oop_result,
 2445                                    Register last_java_sp,
 2446                                    address entry_point,
 2447                                    Register arg_1,
 2448                                    Register arg_2,
 2449                                    bool check_exceptions) {
 2450 
 2451   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2452   pass_arg2(this, arg_2);
 2453   pass_arg1(this, arg_1);
 2454   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 2455 }
 2456 
 2457 void MacroAssembler::super_call_VM(Register oop_result,
 2458                                    Register last_java_sp,
 2459                                    address entry_point,
 2460                                    Register arg_1,
 2461                                    Register arg_2,
 2462                                    Register arg_3,
 2463                                    bool check_exceptions) {
 2464   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
 2465   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
 2466   pass_arg3(this, arg_3);
 2467   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2468   pass_arg2(this, arg_2);
 2469   pass_arg1(this, arg_1);
 2470   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 2471 }
 2472 
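// Shared bottleneck for the call_VM variants above: establish the thread and
// last_Java_sp registers, record the last Java frame, make the call, restore
// the thread, check for pending exceptions, and fetch the oop result if any.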
 2473 void MacroAssembler::call_VM_base(Register oop_result,
 2474                                   Register java_thread,
 2475                                   Register last_java_sp,
 2476                                   address  entry_point,
 2477                                   int      number_of_arguments,
 2478                                   bool     check_exceptions) {
 2479   // determine java_thread register
 2480   if (!java_thread->is_valid()) {
 2481 #ifdef _LP64
 2482     java_thread = r15_thread;
 2483 #else
 2484     java_thread = rdi;
 2485     get_thread(java_thread);
 2486 #endif // LP64
 2487   }
 2488   // determine last_java_sp register
 2489   if (!last_java_sp->is_valid()) {
 2490     last_java_sp = rsp;
 2491   }
 2492   // debugging support
 2493   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 2494   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
 2495 #ifdef ASSERT
 2496   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 2497   // r12 is the heapbase.
 2498   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
 2499 #endif // ASSERT
 2500 
 2501   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 2502   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 2503 
 2504   // push java thread (becomes first argument of C function)
 2505 
 2506   NOT_LP64(push(java_thread); number_of_arguments++);
 2507   LP64_ONLY(mov(c_rarg0, r15_thread));
 2508 
 2509   // set last Java frame before call
 2510   assert(last_java_sp != rbp, "can't use ebp/rbp");
 2511 
 2512   // Only interpreter should have to set fp
 2513   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
 2514 
 2515   // do the call, remove parameters
 2516   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
 2517 
 2518   // restore the thread (cannot use the pushed argument since arguments
 2519   // may be overwritten by C code generated by an optimizing compiler);
2520   // however, we can use the register value directly if it is callee saved.
 2521   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
 2522     // rdi & rsi (also r15) are callee saved -> nothing to do
 2523 #ifdef ASSERT
 2524     guarantee(java_thread != rax, "change this code");
 2525     push(rax);
 2526     { Label L;
 2527       get_thread(rax);
 2528       cmpptr(java_thread, rax);
 2529       jcc(Assembler::equal, L);
 2530       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
 2531       bind(L);
 2532     }
 2533     pop(rax);
 2534 #endif
 2535   } else {
 2536     get_thread(java_thread);
 2537   }
 2538   // reset last Java frame
 2539   // Only interpreter should have to clear fp
 2540   reset_last_Java_frame(java_thread, true);
 2541 
 2542    // C++ interp handles this in the interpreter
 2543   check_and_handle_popframe(java_thread);
 2544   check_and_handle_earlyret(java_thread);
 2545 
 2546   if (check_exceptions) {
 2547     // check for pending exceptions (java_thread is set upon return)
 2548     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
 2549 #ifndef _LP64
 2550     jump_cc(Assembler::notEqual,
 2551             RuntimeAddress(StubRoutines::forward_exception_entry()));
 2552 #else
2553     // This used to conditionally jump to forward_exception; however, it is
2554     // possible that after relocation the branch will not reach.  So we must jump
2555     // around it so we can always reach the target.
 2556 
 2557     Label ok;
 2558     jcc(Assembler::equal, ok);
 2559     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 2560     bind(ok);
 2561 #endif // LP64
 2562   }
 2563 
 2564   // get oop result if there is one and reset the value in the thread
 2565   if (oop_result->is_valid()) {
 2566     get_vm_result(oop_result, java_thread);
 2567   }
 2568 }
 2569 
 2570 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 2571 
2572   // Calculate the value for last_Java_sp; this is somewhat subtle.
2573   // call_VM does an intermediate call which places a return address on the
2574   // stack just under the stack pointer as the caller left it.  This allows
2575   // us to retrieve last_Java_pc from last_Java_sp[-1].
 2577   // On 32bit we then have to push additional args on the stack to accomplish
 2578   // the actual requested call. On 64bit call_VM only can use register args
 2579   // so the only extra space is the return address that call_VM created.
 2580   // This hopefully explains the calculations here.
 2581 
 2582 #ifdef _LP64
 2583   // We've pushed one address, correct last_Java_sp
 2584   lea(rax, Address(rsp, wordSize));
 2585 #else
 2586   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
 2587 #endif // LP64
 2588 
 2589   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
 2590 
 2591 }
 2592 
 2593 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
 2594 void MacroAssembler::call_VM_leaf0(address entry_point) {
 2595   MacroAssembler::call_VM_leaf_base(entry_point, 0);
 2596 }
 2597 
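// call_VM_leaf variants: call a leaf runtime entry directly, without setting
// up a last Java frame; arguments are staged with the pass_arg* helpers.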
 2598 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
 2599   call_VM_leaf_base(entry_point, number_of_arguments);
 2600 }
 2601 
 2602 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
 2603   pass_arg0(this, arg_0);
 2604   call_VM_leaf(entry_point, 1);
 2605 }
 2606 
 2607 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 2608 
 2609   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
 2610   pass_arg1(this, arg_1);
 2611   pass_arg0(this, arg_0);
 2612   call_VM_leaf(entry_point, 2);
 2613 }
 2614 
 2615 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 2616   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
 2617   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2618   pass_arg2(this, arg_2);
 2619   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
 2620   pass_arg1(this, arg_1);
 2621   pass_arg0(this, arg_0);
 2622   call_VM_leaf(entry_point, 3);
 2623 }
 2624 
 2625 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
 2626   pass_arg0(this, arg_0);
 2627   MacroAssembler::call_VM_leaf_base(entry_point, 1);
 2628 }
 2629 
 2630 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
 2631 
 2632   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
 2633   pass_arg1(this, arg_1);
 2634   pass_arg0(this, arg_0);
 2635   MacroAssembler::call_VM_leaf_base(entry_point, 2);
 2636 }
 2637 
 2638 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
 2639   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
 2640   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2641   pass_arg2(this, arg_2);
 2642   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
 2643   pass_arg1(this, arg_1);
 2644   pass_arg0(this, arg_0);
 2645   MacroAssembler::call_VM_leaf_base(entry_point, 3);
 2646 }
 2647 
 2648 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
 2649   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
 2650   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
 2651   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
 2652   pass_arg3(this, arg_3);
 2653   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
 2654   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
 2655   pass_arg2(this, arg_2);
 2656   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
 2657   pass_arg1(this, arg_1);
 2658   pass_arg0(this, arg_0);
 2659   MacroAssembler::call_VM_leaf_base(entry_point, 4);
 2660 }
 2661 
 2662 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 2663   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 2664   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
 2665   verify_oop(oop_result, "broken oop in call_VM_base");
 2666 }
 2667 
 2668 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 2669   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 2670   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
 2671 }
 2672 
 2673 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
 2674 }
 2675 
 2676 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
 2677 }
 2678 
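      // Many of the AddressLiteral overloads below follow one pattern: if the
      // literal is reachable as a direct operand (see reachable()/as_Address()),
      // it is encoded in place; otherwise its address is first materialized into
      // rscratch1 and the instruction uses the memory form through that register.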
 2679 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
 2680   if (reachable(src1)) {
 2681     cmpl(as_Address(src1), imm);
 2682   } else {
 2683     lea(rscratch1, src1);
 2684     cmpl(Address(rscratch1, 0), imm);
 2685   }
 2686 }
 2687 
 2688 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
 2689   assert(!src2.is_lval(), "use cmpptr");
 2690   if (reachable(src2)) {
 2691     cmpl(src1, as_Address(src2));
 2692   } else {
 2693     lea(rscratch1, src2);
 2694     cmpl(src1, Address(rscratch1, 0));
 2695   }
 2696 }
 2697 
 2698 void MacroAssembler::cmp32(Register src1, int32_t imm) {
 2699   Assembler::cmpl(src1, imm);
 2700 }
 2701 
 2702 void MacroAssembler::cmp32(Register src1, Address src2) {
 2703   Assembler::cmpl(src1, src2);
 2704 }
 2705 
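      // cmpsd2int/cmpss2int below materialize a three-way compare in dst:
      // -1 if opr1 < opr2, 0 if equal, +1 if opr1 > opr2. ucomisd/ucomiss set the
      // parity flag for an unordered (NaN) comparison, which is folded into -1 or
      // +1 according to unordered_is_less (a descriptive note on the code below).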
 2706 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
 2707   ucomisd(opr1, opr2);
 2708 
 2709   Label L;
 2710   if (unordered_is_less) {
 2711     movl(dst, -1);
 2712     jcc(Assembler::parity, L);
 2713     jcc(Assembler::below , L);
 2714     movl(dst, 0);
 2715     jcc(Assembler::equal , L);
 2716     increment(dst);
 2717   } else { // unordered is greater
 2718     movl(dst, 1);
 2719     jcc(Assembler::parity, L);
 2720     jcc(Assembler::above , L);
 2721     movl(dst, 0);
 2722     jcc(Assembler::equal , L);
 2723     decrementl(dst);
 2724   }
 2725   bind(L);
 2726 }
 2727 
 2728 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
 2729   ucomiss(opr1, opr2);
 2730 
 2731   Label L;
 2732   if (unordered_is_less) {
 2733     movl(dst, -1);
 2734     jcc(Assembler::parity, L);
 2735     jcc(Assembler::below , L);
 2736     movl(dst, 0);
 2737     jcc(Assembler::equal , L);
 2738     increment(dst);
 2739   } else { // unordered is greater
 2740     movl(dst, 1);
 2741     jcc(Assembler::parity, L);
 2742     jcc(Assembler::above , L);
 2743     movl(dst, 0);
 2744     jcc(Assembler::equal , L);
 2745     decrementl(dst);
 2746   }
 2747   bind(L);
 2748 }
 2749 
 2750 
 2751 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
 2752   if (reachable(src1)) {
 2753     cmpb(as_Address(src1), imm);
 2754   } else {
 2755     lea(rscratch1, src1);
 2756     cmpb(Address(rscratch1, 0), imm);
 2757   }
 2758 }
 2759 
 2760 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
 2761 #ifdef _LP64
 2762   if (src2.is_lval()) {
 2763     movptr(rscratch1, src2);
 2764     Assembler::cmpq(src1, rscratch1);
 2765   } else if (reachable(src2)) {
 2766     cmpq(src1, as_Address(src2));
 2767   } else {
 2768     lea(rscratch1, src2);
 2769     Assembler::cmpq(src1, Address(rscratch1, 0));
 2770   }
 2771 #else
 2772   if (src2.is_lval()) {
 2773     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
 2774   } else {
 2775     cmpl(src1, as_Address(src2));
 2776   }
 2777 #endif // _LP64
 2778 }
 2779 
 2780 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
 2781   assert(src2.is_lval(), "not a mem-mem compare");
 2782 #ifdef _LP64
 2783   // moves src2's literal address
 2784   movptr(rscratch1, src2);
 2785   Assembler::cmpq(src1, rscratch1);
 2786 #else
 2787   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
 2788 #endif // _LP64
 2789 }
 2790 
 2791 void MacroAssembler::cmpoop(Register src1, Register src2) {
 2792   cmpptr(src1, src2);
 2793 }
 2794 
 2795 void MacroAssembler::cmpoop(Register src1, Address src2) {
 2796   cmpptr(src1, src2);
 2797 }
 2798 
 2799 #ifdef _LP64
 2800 void MacroAssembler::cmpoop(Register src1, jobject src2) {
 2801   movoop(rscratch1, src2);
 2802   cmpptr(src1, rscratch1);
 2803 }
 2804 #endif
 2805 
 2806 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
 2807   if (reachable(adr)) {
 2808     if (os::is_MP())
 2809       lock();
 2810     cmpxchgptr(reg, as_Address(adr));
 2811   } else {
 2812     lea(rscratch1, adr);
 2813     if (os::is_MP())
 2814       lock();
 2815     cmpxchgptr(reg, Address(rscratch1, 0));
 2816   }
 2817 }
 2818 
 2819 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
 2820   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
 2821 }
 2822 
 2823 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
 2824   if (reachable(src)) {
 2825     Assembler::comisd(dst, as_Address(src));
 2826   } else {
 2827     lea(rscratch1, src);
 2828     Assembler::comisd(dst, Address(rscratch1, 0));
 2829   }
 2830 }
 2831 
 2832 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
 2833   if (reachable(src)) {
 2834     Assembler::comiss(dst, as_Address(src));
 2835   } else {
 2836     lea(rscratch1, src);
 2837     Assembler::comiss(dst, Address(rscratch1, 0));
 2838   }
 2839 }
 2840 
 2841 
 2842 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
 2843   Condition negated_cond = negate_condition(cond);
 2844   Label L;
 2845   jcc(negated_cond, L);
 2846   pushf(); // Preserve flags
 2847   atomic_incl(counter_addr);
 2848   popf();
 2849   bind(L);
 2850 }
 2851 
 2852 int MacroAssembler::corrected_idivl(Register reg) {
 2853   // Full implementation of Java idiv and irem; checks for
 2854   // special case as described in JVM spec., p.243 & p.271.
 2855   // The function returns the (pc) offset of the idivl
 2856   // instruction - may be needed for implicit exceptions.
 2857   //
 2858   //         normal case                           special case
 2859   //
 2860   // input : rax: dividend                          min_int
 2861   //         reg: divisor   (may not be rax/rdx)    -1
 2862   //
 2863   // output: rax: quotient  (= rax idiv reg)        min_int
 2864   //         rdx: remainder (= rax irem reg)        0
 2865   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 2866   const int min_int = 0x80000000;
 2867   Label normal_case, special_case;
 2868 
 2869   // check for special case
 2870   cmpl(rax, min_int);
 2871   jcc(Assembler::notEqual, normal_case);
 2872   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
 2873   cmpl(reg, -1);
 2874   jcc(Assembler::equal, special_case);
 2875 
 2876   // handle normal case
 2877   bind(normal_case);
 2878   cdql();
 2879   int idivl_offset = offset();
 2880   idivl(reg);
 2881 
 2882   // normal and special case exit
 2883   bind(special_case);
 2884 
 2885   return idivl_offset;
 2886 }
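      // Note on the special case above (illustrative): a plain idivl would raise a
      // divide error (#DE) for min_int / -1, since the true quotient 2^31 does not
      // fit in 32 bits; the short-circuit instead yields quotient min_int and
      // remainder 0, as the JVM spec requires for idiv/irem.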
 2887 
 2888 
 2889 
 2890 void MacroAssembler::decrementl(Register reg, int value) {
 2891   if (value == min_jint) {subl(reg, value) ; return; }
 2892   if (value <  0) { incrementl(reg, -value); return; }
 2893   if (value == 0) {                        ; return; }
 2894   if (value == 1 && UseIncDec) { decl(reg) ; return; }
 2895   /* else */      { subl(reg, value)       ; return; }
 2896 }
 2897 
 2898 void MacroAssembler::decrementl(Address dst, int value) {
 2899   if (value == min_jint) {subl(dst, value) ; return; }
 2900   if (value <  0) { incrementl(dst, -value); return; }
 2901   if (value == 0) {                        ; return; }
 2902   if (value == 1 && UseIncDec) { decl(dst) ; return; }
 2903   /* else */      { subl(dst, value)       ; return; }
 2904 }
 2905 
 2906 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
 2907   assert (shift_value > 0, "illegal shift value");
 2908   Label _is_positive;
 2909   testl (reg, reg);
 2910   jcc (Assembler::positive, _is_positive);
 2911   int offset = (1 << shift_value) - 1 ;
 2912 
 2913   if (offset == 1) {
 2914     incrementl(reg);
 2915   } else {
 2916     addl(reg, offset);
 2917   }
 2918 
 2919   bind (_is_positive);
 2920   sarl(reg, shift_value);
 2921 }
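      // Worked example for the rounding bias above (illustrative): with reg == -7
      // and shift_value == 2, a plain arithmetic shift gives -2 (floor division),
      // but Java requires -7 / 4 == -1. Adding (1 << 2) - 1 == 3 first gives -4,
      // and -4 >> 2 == -1; non-negative values are left unchanged.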
 2922 
 2923 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
 2924   if (reachable(src)) {
 2925     Assembler::divsd(dst, as_Address(src));
 2926   } else {
 2927     lea(rscratch1, src);
 2928     Assembler::divsd(dst, Address(rscratch1, 0));
 2929   }
 2930 }
 2931 
 2932 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
 2933   if (reachable(src)) {
 2934     Assembler::divss(dst, as_Address(src));
 2935   } else {
 2936     lea(rscratch1, src);
 2937     Assembler::divss(dst, Address(rscratch1, 0));
 2938   }
 2939 }
 2940 
 2941 // !defined(COMPILER2) is because of stupid core builds
 2942 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
 2943 void MacroAssembler::empty_FPU_stack() {
 2944   if (VM_Version::supports_mmx()) {
 2945     emms();
 2946   } else {
 2947     for (int i = 8; i-- > 0; ) ffree(i);
 2948   }
 2949 }
 2950 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
 2951 
 2952 
 2953 // Defines obj, preserves var_size_in_bytes
 2954 void MacroAssembler::eden_allocate(Register obj,
 2955                                    Register var_size_in_bytes,
 2956                                    int con_size_in_bytes,
 2957                                    Register t1,
 2958                                    Label& slow_case) {
 2959   assert(obj == rax, "obj must be in rax for cmpxchg");
 2960   assert_different_registers(obj, var_size_in_bytes, t1);
 2961   if (!Universe::heap()->supports_inline_contig_alloc()) {
 2962     jmp(slow_case);
 2963   } else {
 2964     Register end = t1;
 2965     Label retry;
 2966     bind(retry);
 2967     ExternalAddress heap_top((address) Universe::heap()->top_addr());
 2968     movptr(obj, heap_top);
 2969     if (var_size_in_bytes == noreg) {
 2970       lea(end, Address(obj, con_size_in_bytes));
 2971     } else {
 2972       lea(end, Address(obj, var_size_in_bytes, Address::times_1));
 2973     }
 2974     // if end < obj then we wrapped around => object too long => slow case
 2975     cmpptr(end, obj);
 2976     jcc(Assembler::below, slow_case);
 2977     cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
 2978     jcc(Assembler::above, slow_case);
 2979     // Compare obj with the top addr, and if still equal, store the new top addr in
 2980     // end at the address of the top addr pointer. Sets ZF if was equal, and clears
 2981     // it otherwise. Use lock prefix for atomicity on MPs.
 2982     locked_cmpxchgptr(end, heap_top);
 2983     jcc(Assembler::notEqual, retry);
 2984   }
 2985 }
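      // The loop above is the usual bump-the-pointer allocation, roughly
      // (pseudocode sketch, not the emitted instruction sequence):
      //   do { obj = *heap_top; end = obj + size;
      //        if (end wrapped below obj || end > heap_end) goto slow_case;
      //   } while (!CAS(heap_top, expected: obj, new: end));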
 2986 
 2987 void MacroAssembler::enter() {
 2988   push(rbp);
 2989   mov(rbp, rsp);
 2990 }
 2991 
 2992 // A 5 byte nop that is safe for patching (see patch_verified_entry)
 2993 void MacroAssembler::fat_nop() {
 2994   if (UseAddressNop) {
 2995     addr_nop_5();
 2996   } else {
 2997     emit_int8(0x26); // es:
 2998     emit_int8(0x2e); // cs:
 2999     emit_int8(0x64); // fs:
 3000     emit_int8(0x65); // gs:
 3001     emit_int8((unsigned char)0x90);
 3002   }
 3003 }
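      // Note: the four segment-override prefixes plus 0x90 emitted above decode as
      // a single five-byte instruction (rather than five one-byte nops), which is
      // what keeps the patch site within one instruction for patch_verified_entry.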
 3004 
 3005 void MacroAssembler::fcmp(Register tmp) {
 3006   fcmp(tmp, 1, true, true);
 3007 }
 3008 
 3009 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
 3010   assert(!pop_right || pop_left, "usage error");
 3011   if (VM_Version::supports_cmov()) {
 3012     assert(tmp == noreg, "unneeded temp");
 3013     if (pop_left) {
 3014       fucomip(index);
 3015     } else {
 3016       fucomi(index);
 3017     }
 3018     if (pop_right) {
 3019       fpop();
 3020     }
 3021   } else {
 3022     assert(tmp != noreg, "need temp");
 3023     if (pop_left) {
 3024       if (pop_right) {
 3025         fcompp();
 3026       } else {
 3027         fcomp(index);
 3028       }
 3029     } else {
 3030       fcom(index);
 3031     }
 3032     // convert FPU condition into eflags condition via rax
 3033     save_rax(tmp);
 3034     fwait(); fnstsw_ax();
 3035     sahf();
 3036     restore_rax(tmp);
 3037   }
 3038   // condition codes set as follows:
 3039   //
 3040   // CF (corresponds to C0) if x < y
 3041   // PF (corresponds to C2) if unordered
 3042   // ZF (corresponds to C3) if x = y
 3043 }
 3044 
 3045 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
 3046   fcmp2int(dst, unordered_is_less, 1, true, true);
 3047 }
 3048 
 3049 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
 3050   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
 3051   Label L;
 3052   if (unordered_is_less) {
 3053     movl(dst, -1);
 3054     jcc(Assembler::parity, L);
 3055     jcc(Assembler::below , L);
 3056     movl(dst, 0);
 3057     jcc(Assembler::equal , L);
 3058     increment(dst);
 3059   } else { // unordered is greater
 3060     movl(dst, 1);
 3061     jcc(Assembler::parity, L);
 3062     jcc(Assembler::above , L);
 3063     movl(dst, 0);
 3064     jcc(Assembler::equal , L);
 3065     decrementl(dst);
 3066   }
 3067   bind(L);
 3068 }
 3069 
 3070 void MacroAssembler::fld_d(AddressLiteral src) {
 3071   fld_d(as_Address(src));
 3072 }
 3073 
 3074 void MacroAssembler::fld_s(AddressLiteral src) {
 3075   fld_s(as_Address(src));
 3076 }
 3077 
 3078 void MacroAssembler::fld_x(AddressLiteral src) {
 3079   Assembler::fld_x(as_Address(src));
 3080 }
 3081 
 3082 void MacroAssembler::fldcw(AddressLiteral src) {
 3083   Assembler::fldcw(as_Address(src));
 3084 }
 3085 
 3086 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
 3087   if (reachable(src)) {
 3088     Assembler::mulpd(dst, as_Address(src));
 3089   } else {
 3090     lea(rscratch1, src);
 3091     Assembler::mulpd(dst, Address(rscratch1, 0));
 3092   }
 3093 }
 3094 
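      // Informal note on the two helpers below: OR-ing 0x300 into the x87 control
      // word sets the precision-control field to 64-bit (double extended)
      // precision; the original control word stays on the stack so that
      // restore_precision can reload it and pop the slot.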
 3095 void MacroAssembler::increase_precision() {
 3096   subptr(rsp, BytesPerWord);
 3097   fnstcw(Address(rsp, 0));
 3098   movl(rax, Address(rsp, 0));
 3099   orl(rax, 0x300);
 3100   push(rax);
 3101   fldcw(Address(rsp, 0));
 3102   pop(rax);
 3103 }
 3104 
 3105 void MacroAssembler::restore_precision() {
 3106   fldcw(Address(rsp, 0));
 3107   addptr(rsp, BytesPerWord);
 3108 }
 3109 
 3110 void MacroAssembler::fpop() {
 3111   ffree();
 3112   fincstp();
 3113 }
 3114 
 3115 void MacroAssembler::load_float(Address src) {
 3116   if (UseSSE >= 1) {
 3117     movflt(xmm0, src);
 3118   } else {
 3119     LP64_ONLY(ShouldNotReachHere());
 3120     NOT_LP64(fld_s(src));
 3121   }
 3122 }
 3123 
 3124 void MacroAssembler::store_float(Address dst) {
 3125   if (UseSSE >= 1) {
 3126     movflt(dst, xmm0);
 3127   } else {
 3128     LP64_ONLY(ShouldNotReachHere());
 3129     NOT_LP64(fstp_s(dst));
 3130   }
 3131 }
 3132 
 3133 void MacroAssembler::load_double(Address src) {
 3134   if (UseSSE >= 2) {
 3135     movdbl(xmm0, src);
 3136   } else {
 3137     LP64_ONLY(ShouldNotReachHere());
 3138     NOT_LP64(fld_d(src));
 3139   }
 3140 }
 3141 
 3142 void MacroAssembler::store_double(Address dst) {
 3143   if (UseSSE >= 2) {
 3144     movdbl(dst, xmm0);
 3145   } else {
 3146     LP64_ONLY(ShouldNotReachHere());
 3147     NOT_LP64(fstp_d(dst));
 3148   }
 3149 }
 3150 
 3151 void MacroAssembler::fremr(Register tmp) {
 3152   save_rax(tmp);
 3153   { Label L;
 3154     bind(L);
 3155     fprem();
 3156     fwait(); fnstsw_ax();
 3157 #ifdef _LP64
 3158     testl(rax, 0x400);
 3159     jcc(Assembler::notEqual, L);
 3160 #else
 3161     sahf();
 3162     jcc(Assembler::parity, L);
 3163 #endif // _LP64
 3164   }
 3165   restore_rax(tmp);
 3166   // Result is in ST0.
 3167   // Note: fxch & fpop to get rid of ST1
 3168   // (otherwise FPU stack could overflow eventually)
 3169   fxch(1);
 3170   fpop();
 3171 }
 3172 
 3173 // dst = c = a * b + c
 3174 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
 3175   Assembler::vfmadd231sd(c, a, b);
 3176   if (dst != c) {
 3177     movdbl(dst, c);
 3178   }
 3179 }
 3180 
 3181 // dst = c = a * b + c
 3182 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
 3183   Assembler::vfmadd231ss(c, a, b);
 3184   if (dst != c) {
 3185     movflt(dst, c);
 3186   }
 3187 }
 3188 
 3189 // dst = c = a * b + c
 3190 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
 3191   Assembler::vfmadd231pd(c, a, b, vector_len);
 3192   if (dst != c) {
 3193     vmovdqu(dst, c);
 3194   }
 3195 }
 3196 
 3197 // dst = c = a * b + c
 3198 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
 3199   Assembler::vfmadd231ps(c, a, b, vector_len);
 3200   if (dst != c) {
 3201     vmovdqu(dst, c);
 3202   }
 3203 }
 3204 
 3205 // dst = c = a * b + c
 3206 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
 3207   Assembler::vfmadd231pd(c, a, b, vector_len);
 3208   if (dst != c) {
 3209     vmovdqu(dst, c);
 3210   }
 3211 }
 3212 
 3213 // dst = c = a * b + c
 3214 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
 3215   Assembler::vfmadd231ps(c, a, b, vector_len);
 3216   if (dst != c) {
 3217     vmovdqu(dst, c);
 3218   }
 3219 }
 3220 
 3221 void MacroAssembler::incrementl(AddressLiteral dst) {
 3222   if (reachable(dst)) {
 3223     incrementl(as_Address(dst));
 3224   } else {
 3225     lea(rscratch1, dst);
 3226     incrementl(Address(rscratch1, 0));
 3227   }
 3228 }
 3229 
 3230 void MacroAssembler::incrementl(ArrayAddress dst) {
 3231   incrementl(as_Address(dst));
 3232 }
 3233 
 3234 void MacroAssembler::incrementl(Register reg, int value) {
 3235   if (value == min_jint) {addl(reg, value) ; return; }
 3236   if (value <  0) { decrementl(reg, -value); return; }
 3237   if (value == 0) {                        ; return; }
 3238   if (value == 1 && UseIncDec) { incl(reg) ; return; }
 3239   /* else */      { addl(reg, value)       ; return; }
 3240 }
 3241 
 3242 void MacroAssembler::incrementl(Address dst, int value) {
 3243   if (value == min_jint) {addl(dst, value) ; return; }
 3244   if (value <  0) { decrementl(dst, -value); return; }
 3245   if (value == 0) {                        ; return; }
 3246   if (value == 1 && UseIncDec) { incl(dst) ; return; }
 3247   /* else */      { addl(dst, value)       ; return; }
 3248 }
 3249 
 3250 void MacroAssembler::jump(AddressLiteral dst) {
 3251   if (reachable(dst)) {
 3252     jmp_literal(dst.target(), dst.rspec());
 3253   } else {
 3254     lea(rscratch1, dst);
 3255     jmp(rscratch1);
 3256   }
 3257 }
 3258 
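      // jump_cc: conditional jump to an AddressLiteral. When the target is
      // reachable, the jcc is emitted directly, using the 2-byte short form if the
      // displacement fits in 8 bits and no relocation is attached, and the 6-byte
      // rel32 form otherwise. When it is not reachable, the condition is reversed
      // so we branch around an indirect jump through rscratch1, which can reach
      // any address.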
 3259 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
 3260   if (reachable(dst)) {
 3261     InstructionMark im(this);
 3262     relocate(dst.reloc());
 3263     const int short_size = 2;
 3264     const int long_size = 6;
 3265     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
 3266     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
 3267       // 0111 tttn #8-bit disp
 3268       emit_int8(0x70 | cc);
 3269       emit_int8((offs - short_size) & 0xFF);
 3270     } else {
 3271       // 0000 1111 1000 tttn #32-bit disp
 3272       emit_int8(0x0F);
 3273       emit_int8((unsigned char)(0x80 | cc));
 3274       emit_int32(offs - long_size);
 3275     }
 3276   } else {
 3277 #ifdef ASSERT
 3278     warning("reversing conditional branch");
 3279 #endif /* ASSERT */
 3280     Label skip;
 3281     jccb(reverse[cc], skip);
 3282     lea(rscratch1, dst);
 3283     Assembler::jmp(rscratch1);
 3284     bind(skip);
 3285   }
 3286 }
 3287 
 3288 void MacroAssembler::ldmxcsr(AddressLiteral src) {
 3289   if (reachable(src)) {
 3290     Assembler::ldmxcsr(as_Address(src));
 3291   } else {
 3292     lea(rscratch1, src);
 3293     Assembler::ldmxcsr(Address(rscratch1, 0));
 3294   }
 3295 }
 3296 
 3297 int MacroAssembler::load_signed_byte(Register dst, Address src) {
 3298   int off;
 3299   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
 3300     off = offset();
 3301     movsbl(dst, src); // movsxb
 3302   } else {
 3303     off = load_unsigned_byte(dst, src);
 3304     shll(dst, 24);
 3305     sarl(dst, 24);
 3306   }
 3307   return off;
 3308 }
 3309 
 3310 // Note: load_signed_short used to be called load_signed_word.
 3311 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
 3312 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
 3313 // The term "word" in HotSpot means a 32- or 64-bit machine word.
 3314 int MacroAssembler::load_signed_short(Register dst, Address src) {
 3315   int off;
 3316   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
 3317     // This is dubious since it seems safe to do a signed 16 => 64 bit
 3318     // sign-extension here, but this is what 64-bit has always done. It
 3319     // implies that callers rely on only the low 32 bits.
 3320     off = offset();
 3321     movswl(dst, src); // movsxw
 3322   } else {
 3323     off = load_unsigned_short(dst, src);
 3324     shll(dst, 16);
 3325     sarl(dst, 16);
 3326   }
 3327   return off;
 3328 }
 3329 
 3330 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
 3331   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
 3332   // and "3.9 Partial Register Penalties", p. 22).
 3333   int off;
 3334   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
 3335     off = offset();
 3336     movzbl(dst, src); // movzxb
 3337   } else {
 3338     xorl(dst, dst);
 3339     off = offset();
 3340     movb(dst, src);
 3341   }
 3342   return off;
 3343 }
 3344 
 3345 // Note: load_unsigned_short used to be called load_unsigned_word.
 3346 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
 3347   // According to Intel Doc. AP-526 ("Zero-Extension of Short", p. 16,
 3348   // and "3.9 Partial Register Penalties", p. 22).
 3349   int off;
 3350   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
 3351     off = offset();
 3352     movzwl(dst, src); // movzxw
 3353   } else {
 3354     xorl(dst, dst);
 3355     off = offset();
 3356     movw(dst, src);
 3357   }
 3358   return off;
 3359 }
 3360 
 3361 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
 3362   switch (size_in_bytes) {
 3363 #ifndef _LP64
 3364   case  8:
 3365     assert(dst2 != noreg, "second dest register required");
 3366     movl(dst,  src);
 3367     movl(dst2, src.plus_disp(BytesPerInt));
 3368     break;
 3369 #else
 3370   case  8:  movq(dst, src); break;
 3371 #endif
 3372   case  4:  movl(dst, src); break;
 3373   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
 3374   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
 3375   default:  ShouldNotReachHere();
 3376   }
 3377 }
 3378 
 3379 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
 3380   switch (size_in_bytes) {
 3381 #ifndef _LP64
 3382   case  8:
 3383     assert(src2 != noreg, "second source register required");
 3384     movl(dst,                        src);
 3385     movl(dst.plus_disp(BytesPerInt), src2);
 3386     break;
 3387 #else
 3388   case  8:  movq(dst, src); break;
 3389 #endif
 3390   case  4:  movl(dst, src); break;
 3391   case  2:  movw(dst, src); break;
 3392   case  1:  movb(dst, src); break;
 3393   default:  ShouldNotReachHere();
 3394   }
 3395 }
 3396 
 3397 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
 3398   if (reachable(dst)) {
 3399     movl(as_Address(dst), src);
 3400   } else {
 3401     lea(rscratch1, dst);
 3402     movl(Address(rscratch1, 0), src);
 3403   }
 3404 }
 3405 
 3406 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
 3407   if (reachable(src)) {
 3408     movl(dst, as_Address(src));
 3409   } else {
 3410     lea(rscratch1, src);
 3411     movl(dst, Address(rscratch1, 0));
 3412   }
 3413 }
 3414 
 3415 // C++ bool manipulation
 3416 
 3417 void MacroAssembler::movbool(Register dst, Address src) {
 3418   if(sizeof(bool) == 1)
 3419     movb(dst, src);
 3420   else if(sizeof(bool) == 2)
 3421     movw(dst, src);
 3422   else if(sizeof(bool) == 4)
 3423     movl(dst, src);
 3424   else
 3425     // unsupported
 3426     ShouldNotReachHere();
 3427 }
 3428 
 3429 void MacroAssembler::movbool(Address dst, bool boolconst) {
 3430   if(sizeof(bool) == 1)
 3431     movb(dst, (int) boolconst);
 3432   else if(sizeof(bool) == 2)
 3433     movw(dst, (int) boolconst);
 3434   else if(sizeof(bool) == 4)
 3435     movl(dst, (int) boolconst);
 3436   else
 3437     // unsupported
 3438     ShouldNotReachHere();
 3439 }
 3440 
 3441 void MacroAssembler::movbool(Address dst, Register src) {
 3442   if(sizeof(bool) == 1)
 3443     movb(dst, src);
 3444   else if(sizeof(bool) == 2)
 3445     movw(dst, src);
 3446   else if(sizeof(bool) == 4)
 3447     movl(dst, src);
 3448   else
 3449     // unsupported
 3450     ShouldNotReachHere();
 3451 }
 3452 
 3453 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
 3454   movb(as_Address(dst), src);
 3455 }
 3456 
 3457 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
 3458   if (reachable(src)) {
 3459     movdl(dst, as_Address(src));
 3460   } else {
 3461     lea(rscratch1, src);
 3462     movdl(dst, Address(rscratch1, 0));
 3463   }
 3464 }
 3465 
 3466 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
 3467   if (reachable(src)) {
 3468     movq(dst, as_Address(src));
 3469   } else {
 3470     lea(rscratch1, src);
 3471     movq(dst, Address(rscratch1, 0));
 3472   }
 3473 }
 3474 
 3475 void MacroAssembler::setvectmask(Register dst, Register src) {
 3476   Assembler::movl(dst, 1);
 3477   Assembler::shlxl(dst, dst, src);
 3478   Assembler::decl(dst);
 3479   Assembler::kmovdl(k1, dst);
 3480   Assembler::movl(dst, src);
 3481 }
 3482 
 3483 void MacroAssembler::restorevectmask() {
 3484   Assembler::knotwl(k1, k0);
 3485 }
 3486 
 3487 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
 3488   if (reachable(src)) {
 3489     if (UseXmmLoadAndClearUpper) {
 3490       movsd (dst, as_Address(src));
 3491     } else {
 3492       movlpd(dst, as_Address(src));
 3493     }
 3494   } else {
 3495     lea(rscratch1, src);
 3496     if (UseXmmLoadAndClearUpper) {
 3497       movsd (dst, Address(rscratch1, 0));
 3498     } else {
 3499       movlpd(dst, Address(rscratch1, 0));
 3500     }
 3501   }
 3502 }
 3503 
 3504 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
 3505   if (reachable(src)) {
 3506     movss(dst, as_Address(src));
 3507   } else {
 3508     lea(rscratch1, src);
 3509     movss(dst, Address(rscratch1, 0));
 3510   }
 3511 }
 3512 
 3513 void MacroAssembler::movptr(Register dst, Register src) {
 3514   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 3515 }
 3516 
 3517 void MacroAssembler::movptr(Register dst, Address src) {
 3518   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 3519 }
 3520 
 3521 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 3522 void MacroAssembler::movptr(Register dst, intptr_t src) {
 3523   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
 3524 }
 3525 
 3526 void MacroAssembler::movptr(Address dst, Register src) {
 3527   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
 3528 }
 3529 
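      // The movdqu/vmovdqu wrappers below fall back to EVEX-encoded forms when an
      // operand lives in the AVX-512 upper bank (xmm16-xmm31), which the legacy
      // SSE/VEX encodings cannot address; a lane-0 extract/insert or a full
      // evmovdqul transfers the same low 128 (or 256) bits as the plain move.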
 3530 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
 3531   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
 3532     Assembler::vextractf32x4(dst, src, 0);
 3533   } else {
 3534     Assembler::movdqu(dst, src);
 3535   }
 3536 }
 3537 
 3538 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
 3539   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
 3540     Assembler::vinsertf32x4(dst, dst, src, 0);
 3541   } else {
 3542     Assembler::movdqu(dst, src);
 3543   }
 3544 }
 3545 
 3546 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
 3547   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 3548     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
 3549   } else {
 3550     Assembler::movdqu(dst, src);
 3551   }
 3552 }
 3553 
 3554 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
 3555   if (reachable(src)) {
 3556     movdqu(dst, as_Address(src));
 3557   } else {
 3558     lea(scratchReg, src);
 3559     movdqu(dst, Address(scratchReg, 0));
 3560   }
 3561 }
 3562 
 3563 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
 3564   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
 3565     vextractf64x4_low(dst, src);
 3566   } else {
 3567     Assembler::vmovdqu(dst, src);
 3568   }
 3569 }
 3570 
 3571 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
 3572   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
 3573     vinsertf64x4_low(dst, src);
 3574   } else {
 3575     Assembler::vmovdqu(dst, src);
 3576   }
 3577 }
 3578 
 3579 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
 3580   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 3581     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
 3582   }
 3583   else {
 3584     Assembler::vmovdqu(dst, src);
 3585   }
 3586 }
 3587 
 3588 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
 3589   if (reachable(src)) {
 3590     vmovdqu(dst, as_Address(src));
 3591   }
 3592   else {
 3593     lea(rscratch1, src);
 3594     vmovdqu(dst, Address(rscratch1, 0));
 3595   }
 3596 }
 3597 
 3598 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
 3599   if (reachable(src)) {
 3600     Assembler::movdqa(dst, as_Address(src));
 3601   } else {
 3602     lea(rscratch1, src);
 3603     Assembler::movdqa(dst, Address(rscratch1, 0));
 3604   }
 3605 }
 3606 
 3607 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
 3608   if (reachable(src)) {
 3609     Assembler::movsd(dst, as_Address(src));
 3610   } else {
 3611     lea(rscratch1, src);
 3612     Assembler::movsd(dst, Address(rscratch1, 0));
 3613   }
 3614 }
 3615 
 3616 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
 3617   if (reachable(src)) {
 3618     Assembler::movss(dst, as_Address(src));
 3619   } else {
 3620     lea(rscratch1, src);
 3621     Assembler::movss(dst, Address(rscratch1, 0));
 3622   }
 3623 }
 3624 
 3625 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
 3626   if (reachable(src)) {
 3627     Assembler::mulsd(dst, as_Address(src));
 3628   } else {
 3629     lea(rscratch1, src);
 3630     Assembler::mulsd(dst, Address(rscratch1, 0));
 3631   }
 3632 }
 3633 
 3634 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
 3635   if (reachable(src)) {
 3636     Assembler::mulss(dst, as_Address(src));
 3637   } else {
 3638     lea(rscratch1, src);
 3639     Assembler::mulss(dst, Address(rscratch1, 0));
 3640   }
 3641 }
 3642 
 3643 void MacroAssembler::null_check(Register reg, int offset) {
 3644   if (needs_explicit_null_check(offset)) {
 3645     // provoke OS NULL exception if reg = NULL by
 3646     // accessing M[reg] w/o changing any (non-CC) registers
 3647     // NOTE: cmpl is plenty here to provoke a segv
 3648     cmpptr(rax, Address(reg, 0));
 3649     // Note: should probably use testl(rax, Address(reg, 0));
 3650     //       may be shorter code (however, this version of
 3651     //       testl needs to be implemented first)
 3652   } else {
 3653     // nothing to do, (later) access of M[reg + offset]
 3654     // will provoke OS NULL exception if reg = NULL
 3655   }
 3656 }
 3657 
 3658 void MacroAssembler::os_breakpoint() {
 3659   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
 3660   // (e.g., MSVC can't call ps() otherwise)
 3661   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
 3662 }
 3663 
 3664 void MacroAssembler::unimplemented(const char* what) {
 3665   const char* buf = NULL;
 3666   {
 3667     ResourceMark rm;
 3668     stringStream ss;
 3669     ss.print("unimplemented: %s", what);
 3670     buf = code_string(ss.as_string());
 3671   }
 3672   stop(buf);
 3673 }
 3674 
 3675 #ifdef _LP64
 3676 #define XSTATE_BV 0x200
 3677 #endif
 3678 
 3679 void MacroAssembler::pop_CPU_state() {
 3680   pop_FPU_state();
 3681   pop_IU_state();
 3682 }
 3683 
 3684 void MacroAssembler::pop_FPU_state() {
 3685 #ifndef _LP64
 3686   frstor(Address(rsp, 0));
 3687 #else
 3688   fxrstor(Address(rsp, 0));
 3689 #endif
 3690   addptr(rsp, FPUStateSizeInWords * wordSize);
 3691 }
 3692 
 3693 void MacroAssembler::pop_IU_state() {
 3694   popa();
 3695   LP64_ONLY(addq(rsp, 8));
 3696   popf();
 3697 }
 3698 
 3699 // Save Integer and Float state
 3700 // Warning: Stack must be 16 byte aligned (64bit)
 3701 void MacroAssembler::push_CPU_state() {
 3702   push_IU_state();
 3703   push_FPU_state();
 3704 }
 3705 
 3706 void MacroAssembler::push_FPU_state() {
 3707   subptr(rsp, FPUStateSizeInWords * wordSize);
 3708 #ifndef _LP64
 3709   fnsave(Address(rsp, 0));
 3710   fwait();
 3711 #else
 3712   fxsave(Address(rsp, 0));
 3713 #endif // LP64
 3714 }
 3715 
 3716 void MacroAssembler::push_IU_state() {
 3717   // Push flags first because pusha kills them
 3718   pushf();
 3719   // Make sure rsp stays 16-byte aligned
 3720   LP64_ONLY(subq(rsp, 8));
 3721   pusha();
 3722 }
 3723 
 3724 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
 3725   if (!java_thread->is_valid()) {
 3726     java_thread = rdi;
 3727     get_thread(java_thread);
 3728   }
 3729   // we must set sp to zero to clear frame
 3730   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 3731   if (clear_fp) {
 3732     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 3733   }
 3734 
 3735   // Always clear the pc because it could have been set by make_walkable()
 3736   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 3737 
 3738   vzeroupper();
 3739 }
 3740 
 3741 void MacroAssembler::restore_rax(Register tmp) {
 3742   if (tmp == noreg) pop(rax);
 3743   else if (tmp != rax) mov(rax, tmp);
 3744 }
 3745 
 3746 void MacroAssembler::round_to(Register reg, int modulus) {
 3747   addptr(reg, modulus - 1);
 3748   andptr(reg, -modulus);
 3749 }
 3750 
 3751 void MacroAssembler::save_rax(Register tmp) {
 3752   if (tmp == noreg) push(rax);
 3753   else if (tmp != rax) mov(tmp, rax);
 3754 }
 3755 
 3756 // Write serialization page so VM thread can do a pseudo remote membar.
 3757 // We use the current thread pointer to calculate a thread specific
 3758 // offset to write to within the page. This minimizes bus traffic
 3759 // due to cache line collision.
 3760 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
 3761   movl(tmp, thread);
 3762   shrl(tmp, os::get_serialize_page_shift_count());
 3763   andl(tmp, (os::vm_page_size() - sizeof(int)));
 3764 
 3765   Address index(noreg, tmp, Address::times_1);
 3766   ExternalAddress page(os::get_memory_serialize_page());
 3767 
 3768   // Size of store must match masking code above
 3769   movl(as_Address(ArrayAddress(page, index)), tmp);
 3770 }
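      // Illustrative arithmetic (assuming a 4K serialization page): the mask above
      // is 4096 - sizeof(int) == 0xFFC, so each thread stores to a 4-byte aligned,
      // thread-specific slot within the page, which is what keeps most threads off
      // each other's cache lines.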
 3771 
 3772 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
 3773   if (SafepointMechanism::uses_thread_local_poll()) {
 3774 #ifdef _LP64
 3775     assert(thread_reg == r15_thread, "should be");
 3776 #else
 3777     if (thread_reg == noreg) {
 3778       thread_reg = temp_reg;
 3779       get_thread(thread_reg);
 3780     }
 3781 #endif
 3782     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
 3783     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
 3784   } else {
 3785     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
 3786         SafepointSynchronize::_not_synchronized);
 3787     jcc(Assembler::notEqual, slow_path);
 3788   }
 3789 }
 3790 
 3791 // Calls to C land
 3792 //
 3793 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
 3794 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 3795 // has to be reset to 0. This is required to allow proper stack traversal.
 3796 void MacroAssembler::set_last_Java_frame(Register java_thread,
 3797                                          Register last_java_sp,
 3798                                          Register last_java_fp,
 3799                                          address  last_java_pc) {
 3800   vzeroupper();
 3801   // determine java_thread register
 3802   if (!java_thread->is_valid()) {
 3803     java_thread = rdi;
 3804     get_thread(java_thread);
 3805   }
 3806   // determine last_java_sp register
 3807   if (!last_java_sp->is_valid()) {
 3808     last_java_sp = rsp;
 3809   }
 3810 
 3811   // last_java_fp is optional
 3812 
 3813   if (last_java_fp->is_valid()) {
 3814     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
 3815   }
 3816 
 3817   // last_java_pc is optional
 3818 
 3819   if (last_java_pc != NULL) {
 3820     lea(Address(java_thread,
 3821                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
 3822         InternalAddress(last_java_pc));
 3823 
 3824   }
 3825   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 3826 }
 3827 
 3828 void MacroAssembler::shlptr(Register dst, int imm8) {
 3829   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
 3830 }
 3831 
 3832 void MacroAssembler::shrptr(Register dst, int imm8) {
 3833   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
 3834 }
 3835 
 3836 void MacroAssembler::sign_extend_byte(Register reg) {
 3837   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
 3838     movsbl(reg, reg); // movsxb
 3839   } else {
 3840     shll(reg, 24);
 3841     sarl(reg, 24);
 3842   }
 3843 }
 3844 
 3845 void MacroAssembler::sign_extend_short(Register reg) {
 3846   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
 3847     movswl(reg, reg); // movsxw
 3848   } else {
 3849     shll(reg, 16);
 3850     sarl(reg, 16);
 3851   }
 3852 }
 3853 
 3854 void MacroAssembler::testl(Register dst, AddressLiteral src) {
 3855   assert(reachable(src), "Address should be reachable");
 3856   testl(dst, as_Address(src));
 3857 }
 3858 
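      // The wrappers below (pcmpeqb, pcmpeqw, pcmpestri, pmovzxbw, pmovmskb, ptest)
      // share one workaround: without AVX512VL/BW the legacy encodings cannot
      // address xmm16-xmm31, so an operand in the upper bank is routed through
      // xmm0/xmm1, whose original contents are spilled to a 64-byte stack slot and
      // restored afterwards (a descriptive note on the pattern that follows).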
 3859 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
 3860   int dst_enc = dst->encoding();
 3861   int src_enc = src->encoding();
 3862   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 3863     Assembler::pcmpeqb(dst, src);
 3864   } else if ((dst_enc < 16) && (src_enc < 16)) {
 3865     Assembler::pcmpeqb(dst, src);
 3866   } else if (src_enc < 16) {
 3867     subptr(rsp, 64);
 3868     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3869     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 3870     Assembler::pcmpeqb(xmm0, src);
 3871     movdqu(dst, xmm0);
 3872     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3873     addptr(rsp, 64);
 3874   } else if (dst_enc < 16) {
 3875     subptr(rsp, 64);
 3876     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3877     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 3878     Assembler::pcmpeqb(dst, xmm0);
 3879     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3880     addptr(rsp, 64);
 3881   } else {
 3882     subptr(rsp, 64);
 3883     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3884     subptr(rsp, 64);
 3885     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 3886     movdqu(xmm0, src);
 3887     movdqu(xmm1, dst);
 3888     Assembler::pcmpeqb(xmm1, xmm0);
 3889     movdqu(dst, xmm1);
 3890     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 3891     addptr(rsp, 64);
 3892     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3893     addptr(rsp, 64);
 3894   }
 3895 }
 3896 
 3897 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
 3898   int dst_enc = dst->encoding();
 3899   int src_enc = src->encoding();
 3900   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 3901     Assembler::pcmpeqw(dst, src);
 3902   } else if ((dst_enc < 16) && (src_enc < 16)) {
 3903     Assembler::pcmpeqw(dst, src);
 3904   } else if (src_enc < 16) {
 3905     subptr(rsp, 64);
 3906     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3907     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 3908     Assembler::pcmpeqw(xmm0, src);
 3909     movdqu(dst, xmm0);
 3910     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3911     addptr(rsp, 64);
 3912   } else if (dst_enc < 16) {
 3913     subptr(rsp, 64);
 3914     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3915     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 3916     Assembler::pcmpeqw(dst, xmm0);
 3917     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3918     addptr(rsp, 64);
 3919   } else {
 3920     subptr(rsp, 64);
 3921     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3922     subptr(rsp, 64);
 3923     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 3924     movdqu(xmm0, src);
 3925     movdqu(xmm1, dst);
 3926     Assembler::pcmpeqw(xmm1, xmm0);
 3927     movdqu(dst, xmm1);
 3928     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 3929     addptr(rsp, 64);
 3930     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3931     addptr(rsp, 64);
 3932   }
 3933 }
 3934 
 3935 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
 3936   int dst_enc = dst->encoding();
 3937   if (dst_enc < 16) {
 3938     Assembler::pcmpestri(dst, src, imm8);
 3939   } else {
 3940     subptr(rsp, 64);
 3941     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3942     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 3943     Assembler::pcmpestri(xmm0, src, imm8);
 3944     movdqu(dst, xmm0);
 3945     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3946     addptr(rsp, 64);
 3947   }
 3948 }
 3949 
 3950 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
 3951   int dst_enc = dst->encoding();
 3952   int src_enc = src->encoding();
 3953   if ((dst_enc < 16) && (src_enc < 16)) {
 3954     Assembler::pcmpestri(dst, src, imm8);
 3955   } else if (src_enc < 16) {
 3956     subptr(rsp, 64);
 3957     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3958     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 3959     Assembler::pcmpestri(xmm0, src, imm8);
 3960     movdqu(dst, xmm0);
 3961     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3962     addptr(rsp, 64);
 3963   } else if (dst_enc < 16) {
 3964     subptr(rsp, 64);
 3965     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3966     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 3967     Assembler::pcmpestri(dst, xmm0, imm8);
 3968     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3969     addptr(rsp, 64);
 3970   } else {
 3971     subptr(rsp, 64);
 3972     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3973     subptr(rsp, 64);
 3974     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 3975     movdqu(xmm0, src);
 3976     movdqu(xmm1, dst);
 3977     Assembler::pcmpestri(xmm1, xmm0, imm8);
 3978     movdqu(dst, xmm1);
 3979     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 3980     addptr(rsp, 64);
 3981     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 3982     addptr(rsp, 64);
 3983   }
 3984 }
 3985 
 3986 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
 3987   int dst_enc = dst->encoding();
 3988   int src_enc = src->encoding();
 3989   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 3990     Assembler::pmovzxbw(dst, src);
 3991   } else if ((dst_enc < 16) && (src_enc < 16)) {
 3992     Assembler::pmovzxbw(dst, src);
 3993   } else if (src_enc < 16) {
 3994     subptr(rsp, 64);
 3995     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 3996     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 3997     Assembler::pmovzxbw(xmm0, src);
 3998     movdqu(dst, xmm0);
 3999     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4000     addptr(rsp, 64);
 4001   } else if (dst_enc < 16) {
 4002     subptr(rsp, 64);
 4003     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4004     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4005     Assembler::pmovzxbw(dst, xmm0);
 4006     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4007     addptr(rsp, 64);
 4008   } else {
 4009     subptr(rsp, 64);
 4010     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4011     subptr(rsp, 64);
 4012     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4013     movdqu(xmm0, src);
 4014     movdqu(xmm1, dst);
 4015     Assembler::pmovzxbw(xmm1, xmm0);
 4016     movdqu(dst, xmm1);
 4017     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4018     addptr(rsp, 64);
 4019     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4020     addptr(rsp, 64);
 4021   }
 4022 }
 4023 
 4024 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
 4025   int dst_enc = dst->encoding();
 4026   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4027     Assembler::pmovzxbw(dst, src);
 4028   } else if (dst_enc < 16) {
 4029     Assembler::pmovzxbw(dst, src);
 4030   } else {
 4031     subptr(rsp, 64);
 4032     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4033     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4034     Assembler::pmovzxbw(xmm0, src);
 4035     movdqu(dst, xmm0);
 4036     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4037     addptr(rsp, 64);
 4038   }
 4039 }
 4040 
 4041 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
 4042   int src_enc = src->encoding();
 4043   if (src_enc < 16) {
 4044     Assembler::pmovmskb(dst, src);
 4045   } else {
 4046     subptr(rsp, 64);
 4047     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4048     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4049     Assembler::pmovmskb(dst, xmm0);
 4050     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4051     addptr(rsp, 64);
 4052   }
 4053 }
 4054 
 4055 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
 4056   int dst_enc = dst->encoding();
 4057   int src_enc = src->encoding();
 4058   if ((dst_enc < 16) && (src_enc < 16)) {
 4059     Assembler::ptest(dst, src);
 4060   } else if (src_enc < 16) {
 4061     subptr(rsp, 64);
 4062     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4063     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4064     Assembler::ptest(xmm0, src);
 4065     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4066     addptr(rsp, 64);
 4067   } else if (dst_enc < 16) {
 4068     subptr(rsp, 64);
 4069     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4070     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4071     Assembler::ptest(dst, xmm0);
 4072     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4073     addptr(rsp, 64);
 4074   } else {
 4075     subptr(rsp, 64);
 4076     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4077     subptr(rsp, 64);
 4078     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4079     movdqu(xmm0, src);
 4080     movdqu(xmm1, dst);
 4081     Assembler::ptest(xmm1, xmm0);
 4082     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4083     addptr(rsp, 64);
 4084     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4085     addptr(rsp, 64);
 4086   }
 4087 }
 4088 
 4089 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
 4090   if (reachable(src)) {
 4091     Assembler::sqrtsd(dst, as_Address(src));
 4092   } else {
 4093     lea(rscratch1, src);
 4094     Assembler::sqrtsd(dst, Address(rscratch1, 0));
 4095   }
 4096 }
 4097 
 4098 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
 4099   if (reachable(src)) {
 4100     Assembler::sqrtss(dst, as_Address(src));
 4101   } else {
 4102     lea(rscratch1, src);
 4103     Assembler::sqrtss(dst, Address(rscratch1, 0));
 4104   }
 4105 }
 4106 
 4107 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
 4108   if (reachable(src)) {
 4109     Assembler::subsd(dst, as_Address(src));
 4110   } else {
 4111     lea(rscratch1, src);
 4112     Assembler::subsd(dst, Address(rscratch1, 0));
 4113   }
 4114 }
 4115 
 4116 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
 4117   if (reachable(src)) {
 4118     Assembler::subss(dst, as_Address(src));
 4119   } else {
 4120     lea(rscratch1, src);
 4121     Assembler::subss(dst, Address(rscratch1, 0));
 4122   }
 4123 }
 4124 
 4125 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
 4126   if (reachable(src)) {
 4127     Assembler::ucomisd(dst, as_Address(src));
 4128   } else {
 4129     lea(rscratch1, src);
 4130     Assembler::ucomisd(dst, Address(rscratch1, 0));
 4131   }
 4132 }
 4133 
 4134 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
 4135   if (reachable(src)) {
 4136     Assembler::ucomiss(dst, as_Address(src));
 4137   } else {
 4138     lea(rscratch1, src);
 4139     Assembler::ucomiss(dst, Address(rscratch1, 0));
 4140   }
 4141 }
 4142 
 4143 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
 4144   // Used in sign-bit flipping with aligned address.
 4145   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
 4146   if (reachable(src)) {
 4147     Assembler::xorpd(dst, as_Address(src));
 4148   } else {
 4149     lea(rscratch1, src);
 4150     Assembler::xorpd(dst, Address(rscratch1, 0));
 4151   }
 4152 }
 4153 
 4154 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
 4155   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
 4156     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
 4157   }
 4158   else {
 4159     Assembler::xorpd(dst, src);
 4160   }
 4161 }
 4162 
 4163 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
 4164   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
 4165     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
 4166   } else {
 4167     Assembler::xorps(dst, src);
 4168   }
 4169 }
 4170 
 4171 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
 4172   // Used in sign-bit flipping with aligned address.
 4173   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
 4174   if (reachable(src)) {
 4175     Assembler::xorps(dst, as_Address(src));
 4176   } else {
 4177     lea(rscratch1, src);
 4178     Assembler::xorps(dst, Address(rscratch1, 0));
 4179   }
 4180 }
 4181 
 4182 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
 4183   // Used with an aligned address constant (e.g. a shuffle mask).
 4184   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
 4185   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
 4186   if (reachable(src)) {
 4187     Assembler::pshufb(dst, as_Address(src));
 4188   } else {
 4189     lea(rscratch1, src);
 4190     Assembler::pshufb(dst, Address(rscratch1, 0));
 4191   }
 4192 }
 4193 
 4194 // AVX 3-operands instructions
 4195 
 4196 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 4197   if (reachable(src)) {
 4198     vaddsd(dst, nds, as_Address(src));
 4199   } else {
 4200     lea(rscratch1, src);
 4201     vaddsd(dst, nds, Address(rscratch1, 0));
 4202   }
 4203 }
 4204 
 4205 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 4206   if (reachable(src)) {
 4207     vaddss(dst, nds, as_Address(src));
 4208   } else {
 4209     lea(rscratch1, src);
 4210     vaddss(dst, nds, Address(rscratch1, 0));
 4211   }
 4212 }
 4213 
 4214 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
 4215   int dst_enc = dst->encoding();
 4216   int nds_enc = nds->encoding();
 4217   int src_enc = src->encoding();
 4218   if ((dst_enc < 16) && (nds_enc < 16)) {
 4219     vandps(dst, nds, negate_field, vector_len);
 4220   } else if ((src_enc < 16) && (dst_enc < 16)) {
 4221     evmovdqul(src, nds, Assembler::AVX_512bit);
 4222     vandps(dst, src, negate_field, vector_len);
 4223   } else if (src_enc < 16) {
 4224     evmovdqul(src, nds, Assembler::AVX_512bit);
 4225     vandps(src, src, negate_field, vector_len);
 4226     evmovdqul(dst, src, Assembler::AVX_512bit);
 4227   } else if (dst_enc < 16) {
 4228     evmovdqul(src, xmm0, Assembler::AVX_512bit);
 4229     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4230     vandps(dst, xmm0, negate_field, vector_len);
 4231     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4232   } else {
 4233     if (src_enc != dst_enc) {
 4234       evmovdqul(src, xmm0, Assembler::AVX_512bit);
 4235       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4236       vandps(xmm0, xmm0, negate_field, vector_len);
 4237       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4238       evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4239     } else {
 4240       subptr(rsp, 64);
 4241       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4242       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4243       vandps(xmm0, xmm0, negate_field, vector_len);
 4244       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4245       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4246       addptr(rsp, 64);
 4247     }
 4248   }
 4249 }
 4250 
 4251 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
 4252   int dst_enc = dst->encoding();
 4253   int nds_enc = nds->encoding();
 4254   int src_enc = src->encoding();
 4255   if ((dst_enc < 16) && (nds_enc < 16)) {
 4256     vandpd(dst, nds, negate_field, vector_len);
 4257   } else if ((src_enc < 16) && (dst_enc < 16)) {
 4258     evmovdqul(src, nds, Assembler::AVX_512bit);
 4259     vandpd(dst, src, negate_field, vector_len);
 4260   } else if (src_enc < 16) {
 4261     evmovdqul(src, nds, Assembler::AVX_512bit);
 4262     vandpd(src, src, negate_field, vector_len);
 4263     evmovdqul(dst, src, Assembler::AVX_512bit);
 4264   } else if (dst_enc < 16) {
 4265     evmovdqul(src, xmm0, Assembler::AVX_512bit);
 4266     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4267     vandpd(dst, xmm0, negate_field, vector_len);
 4268     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4269   } else {
 4270     if (src_enc != dst_enc) {
 4271       evmovdqul(src, xmm0, Assembler::AVX_512bit);
 4272       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4273       vandpd(xmm0, xmm0, negate_field, vector_len);
 4274       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4275       evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4276     } else {
 4277       subptr(rsp, 64);
 4278       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4279       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4280       vandpd(xmm0, xmm0, negate_field, vector_len);
 4281       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4282       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4283       addptr(rsp, 64);
 4284     }
 4285   }
 4286 }
 4287 
 4288 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 4289   int dst_enc = dst->encoding();
 4290   int nds_enc = nds->encoding();
 4291   int src_enc = src->encoding();
 4292   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4293     Assembler::vpaddb(dst, nds, src, vector_len);
 4294   } else if ((dst_enc < 16) && (src_enc < 16)) {
 4295     Assembler::vpaddb(dst, dst, src, vector_len);
 4296   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4297     // use nds as scratch for src
 4298     evmovdqul(nds, src, Assembler::AVX_512bit);
 4299     Assembler::vpaddb(dst, dst, nds, vector_len);
 4300   } else if ((src_enc < 16) && (nds_enc < 16)) {
 4301     // use nds as scratch for dst
 4302     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4303     Assembler::vpaddb(nds, nds, src, vector_len);
 4304     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4305   } else if (dst_enc < 16) {
 4306     // use nds as scratch for xmm0 to hold src
 4307     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4308     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4309     Assembler::vpaddb(dst, dst, xmm0, vector_len);
 4310     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4311   } else {
 4312     // worst case scenario, all regs are in the upper bank
 4313     subptr(rsp, 64);
 4314     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4315     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4316     evmovdqul(xmm1, src, Assembler::AVX_512bit);
 4317     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4318     Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
 4319     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4320     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4321     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4322     addptr(rsp, 64);
 4323   }
 4324 }
 4325 
 4326 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
 4327   int dst_enc = dst->encoding();
 4328   int nds_enc = nds->encoding();
 4329   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4330     Assembler::vpaddb(dst, nds, src, vector_len);
 4331   } else if (dst_enc < 16) {
 4332     Assembler::vpaddb(dst, dst, src, vector_len);
 4333   } else if (nds_enc < 16) {
 4334     // dst is in the upper bank; use nds as scratch
 4335     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4336     Assembler::vpaddb(nds, nds, src, vector_len);
 4337     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4338   } else {
 4339     // worst case scenario, all regs in upper bank
 4340     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4341     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4342     Assembler::vpaddb(xmm0, xmm0, src, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4343     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4344   }
 4345 }
 4346 
 4347 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 4348   int dst_enc = dst->encoding();
 4349   int nds_enc = nds->encoding();
 4350   int src_enc = src->encoding();
 4351   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4352     Assembler::vpaddw(dst, nds, src, vector_len);
 4353   } else if ((dst_enc < 16) && (src_enc < 16)) {
 4354     Assembler::vpaddw(dst, dst, src, vector_len);
 4355   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4356     // use nds as scratch for src
 4357     evmovdqul(nds, src, Assembler::AVX_512bit);
 4358     Assembler::vpaddw(dst, dst, nds, vector_len);
 4359   } else if ((src_enc < 16) && (nds_enc < 16)) {
 4360     // use nds as scratch for dst
 4361     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4362     Assembler::vpaddw(nds, nds, src, vector_len);
 4363     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4364   } else if (dst_enc < 16) {
 4365     // use nds as scratch for xmm0 to hold src
 4366     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4367     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4368     Assembler::vpaddw(dst, dst, xmm0, vector_len);
 4369     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4370   } else {
 4371     // worst case scenario, all regs are in the upper bank
 4372     subptr(rsp, 64);
 4373     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4374     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4375     evmovdqul(xmm1, src, Assembler::AVX_512bit);
 4376     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4377     Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
 4378     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4379     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4380     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4381     addptr(rsp, 64);
 4382   }
 4383 }
 4384 
 4385 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
 4386   int dst_enc = dst->encoding();
 4387   int nds_enc = nds->encoding();
 4388   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4389     Assembler::vpaddw(dst, nds, src, vector_len);
 4390   } else if (dst_enc < 16) {
 4391     Assembler::vpaddw(dst, dst, src, vector_len);
 4392   } else if (nds_enc < 16) {
 4393     // dst is in the upper bank; use nds as scratch
 4394     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4395     Assembler::vpaddw(nds, nds, src, vector_len);
 4396     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4397   } else {
 4398     // worst case scenario, all regs in upper bank
 4399     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4400     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4401     Assembler::vpaddw(xmm0, xmm0, src, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4402     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4403   }
 4404 }
 4405 
 4406 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
 4407   if (reachable(src)) {
 4408     Assembler::vpand(dst, nds, as_Address(src), vector_len);
 4409   } else {
 4410     lea(rscratch1, src);
 4411     Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
 4412   }
 4413 }
 4414 
 4415 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
 4416   int dst_enc = dst->encoding();
 4417   int src_enc = src->encoding();
 4418   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4419     Assembler::vpbroadcastw(dst, src);
 4420   } else if ((dst_enc < 16) && (src_enc < 16)) {
 4421     Assembler::vpbroadcastw(dst, src);
 4422   } else if (src_enc < 16) {
 4423     subptr(rsp, 64);
 4424     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4425     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4426     Assembler::vpbroadcastw(xmm0, src);
 4427     movdqu(dst, xmm0);
 4428     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4429     addptr(rsp, 64);
 4430   } else if (dst_enc < 16) {
 4431     subptr(rsp, 64);
 4432     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4433     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4434     Assembler::vpbroadcastw(dst, xmm0);
 4435     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4436     addptr(rsp, 64);
 4437   } else {
 4438     subptr(rsp, 64);
 4439     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4440     subptr(rsp, 64);
 4441     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4442     movdqu(xmm0, src);
 4443     movdqu(xmm1, dst);
 4444     Assembler::vpbroadcastw(xmm1, xmm0);
 4445     movdqu(dst, xmm1);
 4446     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4447     addptr(rsp, 64);
 4448     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4449     addptr(rsp, 64);
 4450   }
 4451 }
 4452 
 4453 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 4454   int dst_enc = dst->encoding();
 4455   int nds_enc = nds->encoding();
 4456   int src_enc = src->encoding();
 4457   assert(dst_enc == nds_enc, "");
 4458   if ((dst_enc < 16) && (src_enc < 16)) {
 4459     Assembler::vpcmpeqb(dst, nds, src, vector_len);
 4460   } else if (src_enc < 16) {
 4461     subptr(rsp, 64);
 4462     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4463     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4464     Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
 4465     movdqu(dst, xmm0);
 4466     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4467     addptr(rsp, 64);
 4468   } else if (dst_enc < 16) {
 4469     subptr(rsp, 64);
 4470     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4471     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4472     Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
 4473     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4474     addptr(rsp, 64);
 4475   } else {
 4476     subptr(rsp, 64);
 4477     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4478     subptr(rsp, 64);
 4479     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4480     movdqu(xmm0, src);
 4481     movdqu(xmm1, dst);
 4482     Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
 4483     movdqu(dst, xmm1);
 4484     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4485     addptr(rsp, 64);
 4486     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4487     addptr(rsp, 64);
 4488   }
 4489 }
 4490 
 4491 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 4492   int dst_enc = dst->encoding();
 4493   int nds_enc = nds->encoding();
 4494   int src_enc = src->encoding();
 4495   assert(dst_enc == nds_enc, "");
 4496   if ((dst_enc < 16) && (src_enc < 16)) {
 4497     Assembler::vpcmpeqw(dst, nds, src, vector_len);
 4498   } else if (src_enc < 16) {
 4499     subptr(rsp, 64);
 4500     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4501     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4502     Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
 4503     movdqu(dst, xmm0);
 4504     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4505     addptr(rsp, 64);
 4506   } else if (dst_enc < 16) {
 4507     subptr(rsp, 64);
 4508     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4509     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4510     Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
 4511     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4512     addptr(rsp, 64);
 4513   } else {
 4514     subptr(rsp, 64);
 4515     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4516     subptr(rsp, 64);
 4517     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4518     movdqu(xmm0, src);
 4519     movdqu(xmm1, dst);
 4520     Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
 4521     movdqu(dst, xmm1);
 4522     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4523     addptr(rsp, 64);
 4524     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4525     addptr(rsp, 64);
 4526   }
 4527 }
 4528 
 4529 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
 4530   int dst_enc = dst->encoding();
 4531   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4532     Assembler::vpmovzxbw(dst, src, vector_len);
 4533   } else if (dst_enc < 16) {
 4534     Assembler::vpmovzxbw(dst, src, vector_len);
 4535   } else {
 4536     subptr(rsp, 64);
 4537     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4538     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4539     Assembler::vpmovzxbw(xmm0, src, vector_len);
 4540     movdqu(dst, xmm0);
 4541     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4542     addptr(rsp, 64);
 4543   }
 4544 }
 4545 
 4546 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
 4547   int src_enc = src->encoding();
 4548   if (src_enc < 16) {
 4549     Assembler::vpmovmskb(dst, src);
 4550   } else {
 4551     subptr(rsp, 64);
 4552     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4553     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4554     Assembler::vpmovmskb(dst, xmm0);
 4555     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4556     addptr(rsp, 64);
 4557   }
 4558 }
 4559 
 4560 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 4561   int dst_enc = dst->encoding();
 4562   int nds_enc = nds->encoding();
 4563   int src_enc = src->encoding();
 4564   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4565     Assembler::vpmullw(dst, nds, src, vector_len);
 4566   } else if ((dst_enc < 16) && (src_enc < 16)) {
 4567     Assembler::vpmullw(dst, dst, src, vector_len);
 4568   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4569     // use nds as scratch for src
 4570     evmovdqul(nds, src, Assembler::AVX_512bit);
 4571     Assembler::vpmullw(dst, dst, nds, vector_len);
 4572   } else if ((src_enc < 16) && (nds_enc < 16)) {
 4573     // use nds as scratch for dst
 4574     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4575     Assembler::vpmullw(nds, nds, src, vector_len);
 4576     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4577   } else if (dst_enc < 16) {
 4578     // use nds as scratch for xmm0 to hold src
 4579     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4580     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4581     Assembler::vpmullw(dst, dst, xmm0, vector_len);
 4582     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4583   } else {
 4584     // worst case scenario, all regs are in the upper bank
 4585     subptr(rsp, 64);
 4586     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4587     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4588     evmovdqul(xmm1, src, Assembler::AVX_512bit);
 4589     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4590     Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
 4591     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4592     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4593     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4594     addptr(rsp, 64);
 4595   }
 4596 }
 4597 
 4598 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
 4599   int dst_enc = dst->encoding();
 4600   int nds_enc = nds->encoding();
 4601   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4602     Assembler::vpmullw(dst, nds, src, vector_len);
 4603   } else if (dst_enc < 16) {
 4604     Assembler::vpmullw(dst, dst, src, vector_len);
 4605   } else if (nds_enc < 16) {
 4606     // dst is in the upper bank; use nds as scratch
 4607     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4608     Assembler::vpmullw(nds, nds, src, vector_len);
 4609     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4610   } else {
 4611     // worst case scenario, all regs in upper bank
 4612     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4613     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4614     Assembler::vpmullw(xmm0, xmm0, src, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4615     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4616   }
 4617 }
 4618 
 4619 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 4620   int dst_enc = dst->encoding();
 4621   int nds_enc = nds->encoding();
 4622   int src_enc = src->encoding();
 4623   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4624     Assembler::vpsubb(dst, nds, src, vector_len);
 4625   } else if ((dst_enc < 16) && (src_enc < 16)) {
 4626     Assembler::vpsubb(dst, dst, src, vector_len);
 4627   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4628     // use nds as scratch for src
 4629     evmovdqul(nds, src, Assembler::AVX_512bit);
 4630     Assembler::vpsubb(dst, dst, nds, vector_len);
 4631   } else if ((src_enc < 16) && (nds_enc < 16)) {
 4632     // use nds as scratch for dst
 4633     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4634     Assembler::vpsubb(nds, nds, src, vector_len);
 4635     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4636   } else if (dst_enc < 16) {
 4637     // use nds as scratch for xmm0 to hold src
 4638     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4639     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4640     Assembler::vpsubb(dst, dst, xmm0, vector_len);
 4641     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4642   } else {
 4643     // worst case scenario, all regs are in the upper bank
 4644     subptr(rsp, 64);
 4645     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4646     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4647     evmovdqul(xmm1, src, Assembler::AVX_512bit);
 4648     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4649     Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
 4650     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4651     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4652     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4653     addptr(rsp, 64);
 4654   }
 4655 }
 4656 
 4657 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
 4658   int dst_enc = dst->encoding();
 4659   int nds_enc = nds->encoding();
 4660   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4661     Assembler::vpsubb(dst, nds, src, vector_len);
 4662   } else if (dst_enc < 16) {
 4663     Assembler::vpsubb(dst, dst, src, vector_len);
 4664   } else if (nds_enc < 16) {
 4665     // dst is in the upper bank; use nds as scratch
 4666     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4667     Assembler::vpsubb(nds, nds, src, vector_len);
 4668     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4669   } else {
 4670     // worst case scenario, all regs in upper bank
 4671     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4672     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4673     Assembler::vpsubb(xmm0, xmm0, src, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4674     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4675   }
 4676 }
 4677 
 4678 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
 4679   int dst_enc = dst->encoding();
 4680   int nds_enc = nds->encoding();
 4681   int src_enc = src->encoding();
 4682   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4683     Assembler::vpsubw(dst, nds, src, vector_len);
 4684   } else if ((dst_enc < 16) && (src_enc < 16)) {
 4685     Assembler::vpsubw(dst, dst, src, vector_len);
 4686   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4687     // use nds as scratch for src
 4688     evmovdqul(nds, src, Assembler::AVX_512bit);
 4689     Assembler::vpsubw(dst, dst, nds, vector_len);
 4690   } else if ((src_enc < 16) && (nds_enc < 16)) {
 4691     // use nds as scratch for dst
 4692     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4693     Assembler::vpsubw(nds, nds, src, vector_len);
 4694     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4695   } else if (dst_enc < 16) {
 4696     // use nds as scratch for xmm0 to hold src
 4697     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4698     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4699     Assembler::vpsubw(dst, dst, xmm0, vector_len);
 4700     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4701   } else {
 4702     // worst case scenario, all regs are in the upper bank
 4703     subptr(rsp, 64);
 4704     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4705     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4706     evmovdqul(xmm1, src, Assembler::AVX_512bit);
 4707     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4708     Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
 4709     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4710     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4711     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4712     addptr(rsp, 64);
 4713   }
 4714 }
 4715 
 4716 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
 4717   int dst_enc = dst->encoding();
 4718   int nds_enc = nds->encoding();
 4719   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4720     Assembler::vpsubw(dst, nds, src, vector_len);
 4721   } else if (dst_enc < 16) {
 4722     Assembler::vpsubw(dst, dst, src, vector_len);
 4723   } else if (nds_enc < 16) {
 4724     // dst is in the upper bank; use nds as scratch
 4725     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4726     Assembler::vpsubw(nds, nds, src, vector_len);
 4727     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4728   } else {
 4729     // worst case scenario, all regs in upper bank
 4730     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4731     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4732     Assembler::vpsubw(xmm0, xmm0, src, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4733     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4734   }
 4735 }
 4736 
 4737 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
 4738   int dst_enc = dst->encoding();
 4739   int nds_enc = nds->encoding();
 4740   int shift_enc = shift->encoding();
 4741   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4742     Assembler::vpsraw(dst, nds, shift, vector_len);
 4743   } else if ((dst_enc < 16) && (shift_enc < 16)) {
 4744     Assembler::vpsraw(dst, dst, shift, vector_len);
 4745   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4746     // use nds as scratch to hold shift
 4747     evmovdqul(nds, shift, Assembler::AVX_512bit);
 4748     Assembler::vpsraw(dst, dst, nds, vector_len);
 4749   } else if ((shift_enc < 16) && (nds_enc < 16)) {
 4750     // use nds as scratch with dst
 4751     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4752     Assembler::vpsraw(nds, nds, shift, vector_len);
 4753     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4754   } else if (dst_enc < 16) {
 4755     // use nds to save a copy of xmm0 and hold shift
 4756     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4757     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
 4758     Assembler::vpsraw(dst, dst, xmm0, vector_len);
 4759     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4760   } else if (nds_enc < 16) {
 4761     // use nds and dst as temps
 4762     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4763     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4764     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
 4765     Assembler::vpsraw(nds, nds, xmm0, vector_len);
 4766     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4767     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4768   } else {
 4769     // worst case scenario, all regs are in the upper bank
 4770     subptr(rsp, 64);
 4771     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4772     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4773     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
 4774     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4775     Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
 4776     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
 4777     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4778     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4779     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4780     addptr(rsp, 64);
 4781   }
 4782 }
 4783 
 4784 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
 4785   int dst_enc = dst->encoding();
 4786   int nds_enc = nds->encoding();
 4787   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4788     Assembler::vpsraw(dst, nds, shift, vector_len);
 4789   } else if (dst_enc < 16) {
 4790     Assembler::vpsraw(dst, dst, shift, vector_len);
 4791   } else if (nds_enc < 16) {
 4792     // use nds as scratch
 4793     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4794     Assembler::vpsraw(nds, nds, shift, vector_len);
 4795     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4796   } else {
 4797     // use nds as scratch for xmm0
 4798     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4799     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4800     Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4801     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4802   }
 4803 }
 4804 
 4805 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
 4806   int dst_enc = dst->encoding();
 4807   int nds_enc = nds->encoding();
 4808   int shift_enc = shift->encoding();
 4809   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4810     Assembler::vpsrlw(dst, nds, shift, vector_len);
 4811   } else if ((dst_enc < 16) && (shift_enc < 16)) {
 4812     Assembler::vpsrlw(dst, dst, shift, vector_len);
 4813   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4814     // use nds as scratch to hold shift
 4815     evmovdqul(nds, shift, Assembler::AVX_512bit);
 4816     Assembler::vpsrlw(dst, dst, nds, vector_len);
 4817   } else if ((shift_enc < 16) && (nds_enc < 16)) {
 4818     // use nds as scratch with dst
 4819     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4820     Assembler::vpsrlw(nds, nds, shift, vector_len);
 4821     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4822   } else if (dst_enc < 16) {
 4823     // use nds to save a copy of xmm0 and hold shift
 4824     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4825     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
 4826     Assembler::vpsrlw(dst, dst, xmm0, vector_len);
 4827     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4828   } else if (nds_enc < 16) {
 4829     // use nds and dst as temps
 4830     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4831     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4832     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
 4833     Assembler::vpsrlw(nds, nds, xmm0, vector_len);
 4834     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4835     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4836   } else {
 4837     // worst case scenario, all regs are in the upper bank
 4838     subptr(rsp, 64);
 4839     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4840     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4841     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
 4842     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4843     Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
 4844     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
 4845     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4846     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4847     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4848     addptr(rsp, 64);
 4849   }
 4850 }
 4851 
 4852 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
 4853   int dst_enc = dst->encoding();
 4854   int nds_enc = nds->encoding();
 4855   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4856     Assembler::vpsrlw(dst, nds, shift, vector_len);
 4857   } else if (dst_enc < 16) {
 4858     Assembler::vpsrlw(dst, dst, shift, vector_len);
 4859   } else if (nds_enc < 16) {
 4860     // use nds as scratch
 4861     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4862     Assembler::vpsrlw(nds, nds, shift, vector_len);
 4863     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4864   } else {
 4865     // use nds as scratch for xmm0
 4866     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4867     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4868     Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4869     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4870   }
 4871 }
 4872 
 4873 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
 4874   int dst_enc = dst->encoding();
 4875   int nds_enc = nds->encoding();
 4876   int shift_enc = shift->encoding();
 4877   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4878     Assembler::vpsllw(dst, nds, shift, vector_len);
 4879   } else if ((dst_enc < 16) && (shift_enc < 16)) {
 4880     Assembler::vpsllw(dst, dst, shift, vector_len);
 4881   } else if ((dst_enc < 16) && (nds_enc < 16)) {
 4882     // use nds as scratch to hold shift
 4883     evmovdqul(nds, shift, Assembler::AVX_512bit);
 4884     Assembler::vpsllw(dst, dst, nds, vector_len);
 4885   } else if ((shift_enc < 16) && (nds_enc < 16)) {
 4886     // use nds as scratch with dst
 4887     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4888     Assembler::vpsllw(nds, nds, shift, vector_len);
 4889     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4890   } else if (dst_enc < 16) {
 4891     // use nds to save a copy of xmm0 and hold shift
 4892     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4893     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
 4894     Assembler::vpsllw(dst, dst, xmm0, vector_len);
 4895     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4896   } else if (nds_enc < 16) {
 4897     // use nds and dst as temps
 4898     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4899     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4900     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
 4901     Assembler::vpsllw(nds, nds, xmm0, vector_len);
 4902     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4903     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4904   } else {
 4905     // worst case scenario, all regs are in the upper bank
 4906     subptr(rsp, 64);
 4907     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4908     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4909     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
 4910     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4911     Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
 4912     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
 4913     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4914     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4915     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4916     addptr(rsp, 64);
 4917   }
 4918 }
 4919 
 4920 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
 4921   int dst_enc = dst->encoding();
 4922   int nds_enc = nds->encoding();
 4923   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
 4924     Assembler::vpsllw(dst, nds, shift, vector_len);
 4925   } else if (dst_enc < 16) {
 4926     Assembler::vpsllw(dst, dst, shift, vector_len);
 4927   } else if (nds_enc < 16) {
 4928     // use nds as scratch
 4929     evmovdqul(nds, dst, Assembler::AVX_512bit);
 4930     Assembler::vpsllw(nds, nds, shift, vector_len);
 4931     evmovdqul(dst, nds, Assembler::AVX_512bit);
 4932   } else {
 4933     // use nds as scratch for xmm0
 4934     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
 4935     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4936     Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
          evmovdqul(dst, xmm0, Assembler::AVX_512bit); // write the result back to dst
 4937     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
 4938   }
 4939 }
 4940 
 4941 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
 4942   int dst_enc = dst->encoding();
 4943   int src_enc = src->encoding();
 4944   if ((dst_enc < 16) && (src_enc < 16)) {
 4945     Assembler::vptest(dst, src);
 4946   } else if (src_enc < 16) {
 4947     subptr(rsp, 64);
 4948     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4949     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4950     Assembler::vptest(xmm0, src);
 4951     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4952     addptr(rsp, 64);
 4953   } else if (dst_enc < 16) {
 4954     subptr(rsp, 64);
 4955     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4956     evmovdqul(xmm0, src, Assembler::AVX_512bit);
 4957     Assembler::vptest(dst, xmm0);
 4958     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4959     addptr(rsp, 64);
 4960   } else {
 4961     subptr(rsp, 64);
 4962     evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4963     subptr(rsp, 64);
 4964     evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 4965     movdqu(xmm0, src);
 4966     movdqu(xmm1, dst);
 4967     Assembler::vptest(xmm1, xmm0);
 4968     evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 4969     addptr(rsp, 64);
 4970     evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4971     addptr(rsp, 64);
 4972   }
 4973 }
 4974 
 4975 // This instruction is also emitted from within other macros, so we cannot
 4976 // control which registers it receives when generated through those code paths.
 4977 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
 4978   if (VM_Version::supports_avx512nobw()) {
 4979     int dst_enc = dst->encoding();
 4980     int src_enc = src->encoding();
 4981     if (dst_enc == src_enc) {
 4982       if (dst_enc < 16) {
 4983         Assembler::punpcklbw(dst, src);
 4984       } else {
 4985         subptr(rsp, 64);
 4986         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4987         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 4988         Assembler::punpcklbw(xmm0, xmm0);
 4989         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 4990         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 4991         addptr(rsp, 64);
 4992       }
 4993     } else {
 4994       if ((src_enc < 16) && (dst_enc < 16)) {
 4995         Assembler::punpcklbw(dst, src);
 4996       } else if (src_enc < 16) {
 4997         subptr(rsp, 64);
 4998         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 4999         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 5000         Assembler::punpcklbw(xmm0, src);
 5001         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 5002         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5003         addptr(rsp, 64);
 5004       } else if (dst_enc < 16) {
 5005         subptr(rsp, 64);
 5006         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5007         evmovdqul(xmm0, src, Assembler::AVX_512bit);
 5008         Assembler::punpcklbw(dst, xmm0);
 5009         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5010         addptr(rsp, 64);
 5011       } else {
 5012         subptr(rsp, 64);
 5013         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5014         subptr(rsp, 64);
 5015         evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 5016         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 5017         evmovdqul(xmm1, src, Assembler::AVX_512bit);
 5018         Assembler::punpcklbw(xmm0, xmm1);
 5019         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 5020         evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 5021         addptr(rsp, 64);
 5022         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5023         addptr(rsp, 64);
 5024       }
 5025     }
 5026   } else {
 5027     Assembler::punpcklbw(dst, src);
 5028   }
 5029 }
 5030 
 5031 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
 5032   if (VM_Version::supports_avx512vl()) {
 5033     Assembler::pshufd(dst, src, mode);
 5034   } else {
 5035     int dst_enc = dst->encoding();
 5036     if (dst_enc < 16) {
 5037       Assembler::pshufd(dst, src, mode);
 5038     } else {
 5039       subptr(rsp, 64);
 5040       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5041       Assembler::pshufd(xmm0, src, mode);
 5042       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 5043       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5044       addptr(rsp, 64);
 5045     }
 5046   }
 5047 }
 5048 
 5049 // This instruction is also emitted from within other macros, so we cannot
 5050 // control which registers it receives when generated through those code paths.
 5051 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
 5052   if (VM_Version::supports_avx512nobw()) {
 5053     int dst_enc = dst->encoding();
 5054     int src_enc = src->encoding();
 5055     if (dst_enc == src_enc) {
 5056       if (dst_enc < 16) {
 5057         Assembler::pshuflw(dst, src, mode);
 5058       } else {
 5059         subptr(rsp, 64);
 5060         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5061         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 5062         Assembler::pshuflw(xmm0, xmm0, mode);
 5063         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 5064         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5065         addptr(rsp, 64);
 5066       }
 5067     } else {
 5068       if ((src_enc < 16) && (dst_enc < 16)) {
 5069         Assembler::pshuflw(dst, src, mode);
 5070       } else if (src_enc < 16) {
 5071         subptr(rsp, 64);
 5072         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5073         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 5074         Assembler::pshuflw(xmm0, src, mode);
 5075         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 5076         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5077         addptr(rsp, 64);
 5078       } else if (dst_enc < 16) {
 5079         subptr(rsp, 64);
 5080         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5081         evmovdqul(xmm0, src, Assembler::AVX_512bit);
 5082         Assembler::pshuflw(dst, xmm0, mode);
 5083         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5084         addptr(rsp, 64);
 5085       } else {
 5086         subptr(rsp, 64);
 5087         evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5088         subptr(rsp, 64);
 5089         evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
 5090         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
 5091         evmovdqul(xmm1, src, Assembler::AVX_512bit);
 5092         Assembler::pshuflw(xmm0, xmm1, mode);
 5093         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
 5094         evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
 5095         addptr(rsp, 64);
 5096         evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5097         addptr(rsp, 64);
 5098       }
 5099     }
 5100   } else {
 5101     Assembler::pshuflw(dst, src, mode);
 5102   }
 5103 }
 5104 
 5105 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
 5106   if (reachable(src)) {
 5107     vandpd(dst, nds, as_Address(src), vector_len);
 5108   } else {
 5109     lea(rscratch1, src);
 5110     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
 5111   }
 5112 }
 5113 
 5114 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
 5115   if (reachable(src)) {
 5116     vandps(dst, nds, as_Address(src), vector_len);
 5117   } else {
 5118     lea(rscratch1, src);
 5119     vandps(dst, nds, Address(rscratch1, 0), vector_len);
 5120   }
 5121 }
 5122 
 5123 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5124   if (reachable(src)) {
 5125     vdivsd(dst, nds, as_Address(src));
 5126   } else {
 5127     lea(rscratch1, src);
 5128     vdivsd(dst, nds, Address(rscratch1, 0));
 5129   }
 5130 }
 5131 
 5132 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5133   if (reachable(src)) {
 5134     vdivss(dst, nds, as_Address(src));
 5135   } else {
 5136     lea(rscratch1, src);
 5137     vdivss(dst, nds, Address(rscratch1, 0));
 5138   }
 5139 }
 5140 
 5141 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5142   if (reachable(src)) {
 5143     vmulsd(dst, nds, as_Address(src));
 5144   } else {
 5145     lea(rscratch1, src);
 5146     vmulsd(dst, nds, Address(rscratch1, 0));
 5147   }
 5148 }
 5149 
 5150 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5151   if (reachable(src)) {
 5152     vmulss(dst, nds, as_Address(src));
 5153   } else {
 5154     lea(rscratch1, src);
 5155     vmulss(dst, nds, Address(rscratch1, 0));
 5156   }
 5157 }
 5158 
 5159 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5160   if (reachable(src)) {
 5161     vsubsd(dst, nds, as_Address(src));
 5162   } else {
 5163     lea(rscratch1, src);
 5164     vsubsd(dst, nds, Address(rscratch1, 0));
 5165   }
 5166 }
 5167 
 5168 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5169   if (reachable(src)) {
 5170     vsubss(dst, nds, as_Address(src));
 5171   } else {
 5172     lea(rscratch1, src);
 5173     vsubss(dst, nds, Address(rscratch1, 0));
 5174   }
 5175 }
 5176 
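      // vnegatess/vnegatesd negate a scalar float/double by XORing it with the
      // mask at 'src' (expected to hold the sign-bit flip constant).  On AVX-512
      // parts without AVX512VL the 128-bit vxorps/vxorpd cannot name upper-bank
      // registers, so the value is first copied into a lower-bank register, or
      // staged through xmm0 when dst itself is in the upper bank.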
 5177 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5178   int nds_enc = nds->encoding();
 5179   int dst_enc = dst->encoding();
 5180   bool dst_upper_bank = (dst_enc > 15);
 5181   bool nds_upper_bank = (nds_enc > 15);
 5182   if (VM_Version::supports_avx512novl() &&
 5183       (nds_upper_bank || dst_upper_bank)) {
 5184     if (dst_upper_bank) {
 5185       subptr(rsp, 64);
 5186       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5187       movflt(xmm0, nds);
 5188       vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
 5189       movflt(dst, xmm0);
 5190       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5191       addptr(rsp, 64);
 5192     } else {
 5193       movflt(dst, nds);
 5194       vxorps(dst, dst, src, Assembler::AVX_128bit);
 5195     }
 5196   } else {
 5197     vxorps(dst, nds, src, Assembler::AVX_128bit);
 5198   }
 5199 }
 5200 
 5201 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
 5202   int nds_enc = nds->encoding();
 5203   int dst_enc = dst->encoding();
 5204   bool dst_upper_bank = (dst_enc > 15);
 5205   bool nds_upper_bank = (nds_enc > 15);
 5206   if (VM_Version::supports_avx512novl() &&
 5207       (nds_upper_bank || dst_upper_bank)) {
 5208     if (dst_upper_bank) {
 5209       subptr(rsp, 64);
 5210       evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
 5211       movdbl(xmm0, nds);
 5212       vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
 5213       movdbl(dst, xmm0);
 5214       evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
 5215       addptr(rsp, 64);
 5216     } else {
 5217       movdbl(dst, nds);
 5218       vxorpd(dst, dst, src, Assembler::AVX_128bit);
 5219     }
 5220   } else {
 5221     vxorpd(dst, nds, src, Assembler::AVX_128bit);
 5222   }
 5223 }
 5224 
 5225 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
 5226   if (reachable(src)) {
 5227     vxorpd(dst, nds, as_Address(src), vector_len);
 5228   } else {
 5229     lea(rscratch1, src);
 5230     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
 5231   }
 5232 }
 5233 
 5234 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
 5235   if (reachable(src)) {
 5236     vxorps(dst, nds, as_Address(src), vector_len);
 5237   } else {
 5238     lea(rscratch1, src);
 5239     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
 5240   }
 5241 }
 5242 
 5243 
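      // resolve_jobject turns a JNI handle into the oop it refers to.  Weak
      // handles are tagged in their low bits (JNIHandles::weak_tag_mask); a
      // tagged handle is dereferenced at (value - weak_tag_value) and, under G1,
      // the referent is fed to the SATB pre-barrier so a concurrent marking
      // cycle keeps it alive.  Untagged (strong) handles are simply dereferenced.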
 5244 void MacroAssembler::resolve_jobject(Register value,
 5245                                      Register thread,
 5246                                      Register tmp) {
 5247   assert_different_registers(value, thread, tmp);
 5248   Label done, not_weak;
 5249   testptr(value, value);
 5250   jcc(Assembler::zero, done);                // Use NULL as-is.
 5251   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
 5252   jcc(Assembler::zero, not_weak);
 5253   // Resolve jweak.
 5254   movptr(value, Address(value, -JNIHandles::weak_tag_value));
 5255   verify_oop(value);
 5256 #if INCLUDE_ALL_GCS
 5257   if (UseG1GC) {
 5258     g1_write_barrier_pre(noreg /* obj */,
 5259                          value /* pre_val */,
 5260                          thread /* thread */,
 5261                          tmp /* tmp */,
 5262                          true /* tosca_live */,
 5263                          true /* expand_call */);
 5264   }
 5265 #endif // INCLUDE_ALL_GCS
 5266   jmp(done);
 5267   bind(not_weak);
 5268   // Resolve (untagged) jobject.
 5269   movptr(value, Address(value, 0));
 5270   verify_oop(value);
 5271   bind(done);
 5272 }
 5273 
 5274 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
 5275   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
 5276   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
 5277   // The inverted mask is sign-extended
 5278   andptr(possibly_jweak, inverted_jweak_mask);
 5279 }
 5280 
 5281 //////////////////////////////////////////////////////////////////////////////////
 5282 #if INCLUDE_ALL_GCS
 5283 
 5284 void MacroAssembler::g1_write_barrier_pre(Register obj,
 5285                                           Register pre_val,
 5286                                           Register thread,
 5287                                           Register tmp,
 5288                                           bool tosca_live,
 5289                                           bool expand_call) {
 5290 
 5291   // If expand_call is true then we expand the call_VM_leaf macro
 5292   // directly, to skip the _last_sp check generated by
 5293   // InterpreterMacroAssembler::call_VM_leaf_base.
 5294 
 5295 #ifdef _LP64
 5296   assert(thread == r15_thread, "must be");
 5297 #endif // _LP64
 5298 
 5299   Label done;
 5300   Label runtime;
 5301 
 5302   assert(pre_val != noreg, "check this code");
 5303 
 5304   if (obj != noreg) {
 5305     assert_different_registers(obj, pre_val, tmp);
 5306     assert(pre_val != rax, "check this code");
 5307   }
 5308 
 5309   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
 5310                                        SATBMarkQueue::byte_offset_of_active()));
 5311   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
 5312                                        SATBMarkQueue::byte_offset_of_index()));
 5313   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
 5314                                        SATBMarkQueue::byte_offset_of_buf()));
 5315 
 5316 
 5317   // Is marking active?
 5318   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
 5319     cmpl(in_progress, 0);
 5320   } else {
 5321     assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
 5322     cmpb(in_progress, 0);
 5323   }
 5324   jcc(Assembler::equal, done);
 5325 
 5326   // Do we need to load the previous value?
 5327   if (obj != noreg) {
 5328     load_heap_oop(pre_val, Address(obj, 0));
 5329   }
 5330 
 5331   // Is the previous value null?
 5332   cmpptr(pre_val, (int32_t) NULL_WORD);
 5333   jcc(Assembler::equal, done);
 5334 
 5335   // Can we store original value in the thread's buffer?
 5336   // Is index == 0?
 5337   // (The index field is typed as size_t.)
 5338 
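        // In outline (illustrative only):
        //   if (index == 0) goto runtime;     // no room left in this buffer
        //   index -= wordSize;
        //   *(buffer + index) = pre_val;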
 5339   movptr(tmp, index);                   // tmp := *index_adr
 5340   cmpptr(tmp, 0);                       // tmp == 0?
 5341   jcc(Assembler::equal, runtime);       // If yes, goto runtime
 5342 
 5343   subptr(tmp, wordSize);                // tmp := tmp - wordSize
 5344   movptr(index, tmp);                   // *index_adr := tmp
 5345   addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
 5346 
 5347   // Record the previous value
 5348   movptr(Address(tmp, 0), pre_val);
 5349   jmp(done);
 5350 
 5351   bind(runtime);
 5352   // save the live input values
 5353   if(tosca_live) push(rax);
 5354 
 5355   if (obj != noreg && obj != rax)
 5356     push(obj);
 5357 
 5358   if (pre_val != rax)
 5359     push(pre_val);
 5360 
 5361   // Calling the runtime using the regular call_VM_leaf mechanism generates
 5362   // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
 5363   // that checks that *(ebp+frame::interpreter_frame_last_sp) == NULL.
 5364   //
 5365   // If we are generating the pre-barrier without a frame (e.g. in the
 5366   // intrinsified Reference.get() routine) then ebp might be pointing to
 5367   // the caller frame and so this check will most likely fail at runtime.
 5368   //
 5369   // Expanding the call directly bypasses the generation of the check.
 5370   // So when we do not have a full interpreter frame on the stack
 5371   // expand_call should be passed true.
 5372 
 5373   NOT_LP64( push(thread); )
 5374 
 5375   if (expand_call) {
 5376     LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
 5377     pass_arg1(this, thread);
 5378     pass_arg0(this, pre_val);
 5379     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
 5380   } else {
 5381     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
 5382   }
 5383 
 5384   NOT_LP64( pop(thread); )
 5385 
 5386   // restore the live input values
 5387   if (pre_val != rax)
 5388     pop(pre_val);
 5389 
 5390   if (obj != noreg && obj != rax)
 5391     pop(obj);
 5392 
 5393   if(tosca_live) pop(rax);
 5394 
 5395   bind(done);
 5396 }
 5397 
 5398 void MacroAssembler::g1_write_barrier_post(Register store_addr,
 5399                                            Register new_val,
 5400                                            Register thread,
 5401                                            Register tmp,
 5402                                            Register tmp2) {
 5403 #ifdef _LP64
 5404   assert(thread == r15_thread, "must be");
 5405 #endif // _LP64
 5406 
 5407   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
 5408                                        DirtyCardQueue::byte_offset_of_index()));
 5409   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
 5410                                        DirtyCardQueue::byte_offset_of_buf()));
 5411 
 5412   CardTableModRefBS* ctbs =
 5413     barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
 5414   CardTable* ct = ctbs->card_table();
 5415   assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
 5416 
 5417   Label done;
 5418   Label runtime;
 5419 
 5420   // Does store cross heap regions?
 5421 
 5422   movptr(tmp, store_addr);
 5423   xorptr(tmp, new_val);
 5424   shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
 5425   jcc(Assembler::equal, done);
 5426 
 5427   // crosses regions, storing NULL?
 5428 
 5429   cmpptr(new_val, (int32_t) NULL_WORD);
 5430   jcc(Assembler::equal, done);
 5431 
 5432   // storing region crossing non-NULL, is card already dirty?
 5433 
 5434   const Register card_addr = tmp;
 5435   const Register cardtable = tmp2;
 5436 
 5437   movptr(card_addr, store_addr);
 5438   shrptr(card_addr, CardTable::card_shift);
 5439   // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
 5440   // a valid address and therefore is not properly handled by the relocation code.
 5441   movptr(cardtable, (intptr_t)ct->byte_map_base());
 5442   addptr(card_addr, cardtable);
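        // card_addr now holds &byte_map_base[uintptr_t(store_addr) >> card_shift],
        // i.e. the byte tracking the card (2^card_shift bytes) that contains store_addr.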
 5443 
 5444   cmpb(Address(card_addr, 0), (int)G1CardTable::g1_young_card_val());
 5445   jcc(Assembler::equal, done);
 5446 
 5447   membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
 5448   cmpb(Address(card_addr, 0), (int)CardTable::dirty_card_val());
 5449   jcc(Assembler::equal, done);
 5450 
 5451 
 5452   // storing a region crossing, non-NULL oop, card is clean.
 5453   // dirty card and log.
 5454 
 5455   movb(Address(card_addr, 0), (int)CardTable::dirty_card_val());
 5456 
 5457   cmpl(queue_index, 0);
 5458   jcc(Assembler::equal, runtime);
 5459   subl(queue_index, wordSize);
 5460   movptr(tmp2, buffer);
 5461 #ifdef _LP64
 5462   movslq(rscratch1, queue_index);
 5463   addq(tmp2, rscratch1);
 5464   movq(Address(tmp2, 0), card_addr);
 5465 #else
 5466   addl(tmp2, queue_index);
 5467   movl(Address(tmp2, 0), card_addr);
 5468 #endif
 5469   jmp(done);
 5470 
 5471   bind(runtime);
 5472   // save the live input values
 5473   push(store_addr);
 5474   push(new_val);
 5475 #ifdef _LP64
 5476   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
 5477 #else
 5478   push(thread);
 5479   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
 5480   pop(thread);
 5481 #endif
 5482   pop(new_val);
 5483   pop(store_addr);
 5484 
 5485   bind(done);
 5486 }
 5487 
 5488 #endif // INCLUDE_ALL_GCS
 5489 //////////////////////////////////////////////////////////////////////////////////
 5490 
 5491 
 5492 void MacroAssembler::store_check(Register obj, Address dst) {
 5493   store_check(obj);
 5494 }
 5495 
 5496 void MacroAssembler::store_check(Register obj) {
 5497   // Does a store check for the oop in register obj. The content of
 5498   // register obj is destroyed afterwards.
 5499   BarrierSet* bs = Universe::heap()->barrier_set();
 5500   assert(bs->kind() == BarrierSet::CardTableModRef,
 5501          "Wrong barrier set kind");
 5502 
 5503   CardTableModRefBS* ctbs = barrier_set_cast<CardTableModRefBS>(bs);
 5504   CardTable* ct = ctbs->card_table();
 5505   assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
 5506 
 5507   shrptr(obj, CardTable::card_shift);
 5508 
 5509   Address card_addr;
 5510 
 5511   // The calculation for byte_map_base is as follows:
 5512   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
 5513   // So this essentially converts an address to a displacement and it will
 5514   // never need to be relocated. On 64bit however the value may be too
 5515   // large for a 32bit displacement.
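        // Illustrative example: after the shift above, the card byte for an oop
        // originally at address A lives at byte_map_base + (A >> card_shift)
        // (512-byte cards by default).  Encoding byte_map_base as the displacement
        // of a base-less [obj*1 + disp] operand lets a single instruction form
        // that address, provided the displacement fits in 32 bits.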
 5516   intptr_t disp = (intptr_t) ct->byte_map_base();
 5517   if (is_simm32(disp)) {
 5518     card_addr = Address(noreg, obj, Address::times_1, disp);
 5519   } else {
 5520     // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
 5521     // displacement and done in a single instruction given favorable mapping and a
 5522     // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
 5523     // entry and that entry is not properly handled by the relocation code.
 5524     AddressLiteral cardtable((address)ct->byte_map_base(), relocInfo::none);
 5525     Address index(noreg, obj, Address::times_1);
 5526     card_addr = as_Address(ArrayAddress(cardtable, index));
 5527   }
 5528 
 5529   int dirty = CardTable::dirty_card_val();
 5530   if (UseCondCardMark) {
 5531     Label L_already_dirty;
 5532     if (UseConcMarkSweepGC) {
 5533       membar(Assembler::StoreLoad);
 5534     }
 5535     cmpb(card_addr, dirty);
 5536     jcc(Assembler::equal, L_already_dirty);
 5537     movb(card_addr, dirty);
 5538     bind(L_already_dirty);
 5539   } else {
 5540     movb(card_addr, dirty);
 5541   }
 5542 }
 5543 
 5544 void MacroAssembler::subptr(Register dst, int32_t imm32) {
 5545   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
 5546 }
 5547 
 5548 // Force generation of a 4-byte immediate value even if it fits into 8 bits
 5549 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
 5550   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
 5551 }
 5552 
 5553 void MacroAssembler::subptr(Register dst, Register src) {
 5554   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
 5555 }
 5556 
 5557 // C++ bool manipulation
 5558 void MacroAssembler::testbool(Register dst) {
 5559   if(sizeof(bool) == 1)
 5560     testb(dst, 0xff);
 5561   else if(sizeof(bool) == 2) {
 5562     // testw implementation needed for two byte bools
 5563     ShouldNotReachHere();
 5564   } else if(sizeof(bool) == 4)
 5565     testl(dst, dst);
 5566   else
 5567     // unsupported
 5568     ShouldNotReachHere();
 5569 }
 5570 
 5571 void MacroAssembler::testptr(Register dst, Register src) {
 5572   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
 5573 }
 5574 
 5575 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
 5576 void MacroAssembler::tlab_allocate(Register obj,
 5577                                    Register var_size_in_bytes,
 5578                                    int con_size_in_bytes,
 5579                                    Register t1,
 5580                                    Register t2,
 5581                                    Label& slow_case) {
 5582   assert_different_registers(obj, t1, t2);
 5583   assert_different_registers(obj, var_size_in_bytes, t1);
 5584   Register end = t2;
 5585   Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
 5586 
 5587   verify_tlab();
 5588 
 5589   NOT_LP64(get_thread(thread));
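        // Bump-pointer allocation from the TLAB:
        //   obj = tlab_top; end = obj + size; if (end > tlab_end) goto slow_case; tlab_top = end;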
 5590 
 5591   movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
 5592   if (var_size_in_bytes == noreg) {
 5593     lea(end, Address(obj, con_size_in_bytes));
 5594   } else {
 5595     lea(end, Address(obj, var_size_in_bytes, Address::times_1));
 5596   }
 5597   cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
 5598   jcc(Assembler::above, slow_case);
 5599 
 5600   // update the tlab top pointer
 5601   movptr(Address(thread, JavaThread::tlab_top_offset()), end);
 5602 
 5603   // recover var_size_in_bytes if necessary
 5604   if (var_size_in_bytes == end) {
 5605     subptr(var_size_in_bytes, obj);
 5606   }
 5607   verify_tlab();
 5608 }
 5609 
 5610 // Preserves the contents of address; destroys the contents of length_in_bytes and temp.
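      // Zeroes length_in_bytes bytes starting at address + offset_in_bytes,
      // clearing one word (LP64) or two words (32-bit) per loop iteration.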
 5611 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
 5612   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
 5613   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
 5614   Label done;
 5615 
 5616   testptr(length_in_bytes, length_in_bytes);
 5617   jcc(Assembler::zero, done);
 5618 
 5619   // initialize topmost word, divide index by 2, check if odd and test if zero
 5620   // note: for the remaining code to work, index must be a multiple of BytesPerWord
 5621 #ifdef ASSERT
 5622   {
 5623     Label L;
 5624     testptr(length_in_bytes, BytesPerWord - 1);
 5625     jcc(Assembler::zero, L);
 5626     stop("length must be a multiple of BytesPerWord");
 5627     bind(L);
 5628   }
 5629 #endif
 5630   Register index = length_in_bytes;
 5631   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
 5632   if (UseIncDec) {
 5633     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
 5634   } else {
 5635     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
 5636     shrptr(index, 1);
 5637   }
 5638 #ifndef _LP64
 5639   // index might not have been a multiple of 8 (i.e., bit 2 was set)
 5640   {
 5641     Label even;
 5642     // note: if index was a multiple of 8, then it cannot
 5643     //       be 0 now otherwise it must have been 0 before
 5644     //       => if it is even, we don't need to check for 0 again
 5645     jcc(Assembler::carryClear, even);
 5646     // clear topmost word (no jump would be needed if conditional assignment worked here)
 5647     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
 5648     // index could be 0 now, must check again
 5649     jcc(Assembler::zero, done);
 5650     bind(even);
 5651   }
 5652 #endif // !_LP64
 5653   // initialize remaining object fields: index is a multiple of 2 now
 5654   {
 5655     Label loop;
 5656     bind(loop);
 5657     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
 5658     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
 5659     decrement(index);
 5660     jcc(Assembler::notZero, loop);
 5661   }
 5662 
 5663   bind(done);
 5664 }
 5665 
 5666 void MacroAssembler::incr_allocated_bytes(Register thread,
 5667                                           Register var_size_in_bytes,
 5668                                           int con_size_in_bytes,
 5669                                           Register t1) {
 5670   if (!thread->is_valid()) {
 5671 #ifdef _LP64
 5672     thread = r15_thread;
 5673 #else
 5674     assert(t1->is_valid(), "need temp reg");
 5675     thread = t1;
 5676     get_thread(thread);
 5677 #endif
 5678   }
 5679 
 5680 #ifdef _LP64
 5681   if (var_size_in_bytes->is_valid()) {
 5682     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
 5683   } else {
 5684     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
 5685   }
 5686 #else
 5687   if (var_size_in_bytes->is_valid()) {
 5688     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
 5689   } else {
 5690     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
 5691   }
 5692   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
 5693 #endif
 5694 }
 5695 
 5696 // Look up the method for a megamorphic invokeinterface call.
 5697 // The target method is determined by <intf_klass, itable_index>.
 5698 // The receiver klass is in recv_klass.
 5699 // On success, the result will be in method_result, and execution falls through.
 5700 // On failure, execution transfers to the given label.
 5701 void MacroAssembler::lookup_interface_method(Register recv_klass,
 5702                                              Register intf_klass,
 5703                                              RegisterOrConstant itable_index,
 5704                                              Register method_result,
 5705                                              Register scan_temp,
 5706                                              Label& L_no_such_interface,
 5707                                              bool return_method) {
 5708   assert_different_registers(recv_klass, intf_klass, scan_temp);
 5709   assert_different_registers(method_result, intf_klass, scan_temp);
 5710   assert(recv_klass != method_result || !return_method,
 5711          "recv_klass can be destroyed when method isn't needed");
 5712 
 5713   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 5714          "caller must use same register for non-constant itable index as for method");
 5715 
 5716   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 5717   int vtable_base = in_bytes(Klass::vtable_start_offset());
 5718   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 5719   int scan_step   = itableOffsetEntry::size() * wordSize;
 5720   int vte_size    = vtableEntry::size_in_bytes();
 5721   Address::ScaleFactor times_vte_scale = Address::times_ptr;
 5722   assert(vte_size == wordSize, "else adjust times_vte_scale");
 5723 
 5724   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
 5725 
 5726   // %%% Could store the aligned, prescaled offset in the klassoop.
 5727   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
 5728 
 5729   if (return_method) {
 5730     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
 5731     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
 5732     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
 5733   }
 5734 
 5735   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
 5736   //   if (scan->interface() == intf) {
 5737   //     result = (klass + scan->offset() + itable_index);
 5738   //   }
 5739   // }
 5740   Label search, found_method;
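        // The emission loop below runs twice: the first pass peels the initial
        // itable compare out of the scan loop, so the common case of a hit on the
        // first entry is handled with a single short branch to found_method.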
 5741 
 5742   for (int peel = 1; peel >= 0; peel--) {
 5743     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
 5744     cmpptr(intf_klass, method_result);
 5745 
 5746     if (peel) {
 5747       jccb(Assembler::equal, found_method);
 5748     } else {
 5749       jccb(Assembler::notEqual, search);
 5750       // (invert the test to fall through to found_method...)
 5751     }
 5752 
 5753     if (!peel)  break;
 5754 
 5755     bind(search);
 5756 
 5757     // Check that the previous entry is non-null.  A null entry means that
 5758     // the receiver class doesn't implement the interface, and wasn't the
 5759     // same as when the caller was compiled.
 5760     testptr(method_result, method_result);
 5761     jcc(Assembler::zero, L_no_such_interface);
 5762     addptr(scan_temp, scan_step);
 5763   }
 5764 
 5765   bind(found_method);
 5766 
 5767   if (return_method) {
 5768     // Got a hit.
 5769     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
 5770     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
 5771   }
 5772 }
 5773 
 5774 
 5775 // virtual method calling
 5776 void MacroAssembler::lookup_virtual_method(Register recv_klass,
 5777                                            RegisterOrConstant vtable_index,
 5778                                            Register method_result) {
 5779   const int base = in_bytes(Klass::vtable_start_offset());
 5780   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
 5781   Address vtable_entry_addr(recv_klass,
 5782                             vtable_index, Address::times_ptr,
 5783                             base + vtableEntry::method_offset_in_bytes());
 5784   movptr(method_result, vtable_entry_addr);
 5785 }
 5786 
 5787 
 5788 void MacroAssembler::check_klass_subtype(Register sub_klass,
 5789                            Register super_klass,
 5790                            Register temp_reg,
 5791                            Label& L_success) {
 5792   Label L_failure;
 5793   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
 5794   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
 5795   bind(L_failure);
 5796 }
 5797 
 5798 
 5799 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
 5800                                                    Register super_klass,
 5801                                                    Register temp_reg,
 5802                                                    Label* L_success,
 5803                                                    Label* L_failure,
 5804                                                    Label* L_slow_path,
 5805                                         RegisterOrConstant super_check_offset) {
 5806   assert_different_registers(sub_klass, super_klass, temp_reg);
 5807   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
 5808   if (super_check_offset.is_register()) {
 5809     assert_different_registers(sub_klass, super_klass,
 5810                                super_check_offset.as_register());
 5811   } else if (must_load_sco) {
 5812     assert(temp_reg != noreg, "supply either a temp or a register offset");
 5813   }
 5814 
 5815   Label L_fallthrough;
 5816   int label_nulls = 0;
 5817   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
 5818   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
 5819   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
 5820   assert(label_nulls <= 1, "at most one NULL in the batch");
 5821 
 5822   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 5823   int sco_offset = in_bytes(Klass::super_check_offset_offset());
 5824   Address super_check_offset_addr(super_klass, sco_offset);
 5825 
 5826   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
 5827   // range of a jccb.  If this routine grows larger, reconsider at
 5828   // least some of these.
 5829 #define local_jcc(assembler_cond, label)                                \
 5830   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
 5831   else                             jcc( assembler_cond, label) /*omit semi*/
 5832 
 5833   // Hacked jmp, which may only be used just before L_fallthrough.
 5834 #define final_jmp(label)                                                \
 5835   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
 5836   else                            jmp(label)                /*omit semi*/
 5837 
 5838   // If the pointers are equal, we are done (e.g., String[] elements).
 5839   // This self-check enables sharing of secondary supertype arrays among
 5840   // non-primary types such as array-of-interface.  Otherwise, each such
 5841   // type would need its own customized SSA.
 5842   // We move this check to the front of the fast path because many
 5843   // type checks are in fact trivially successful in this manner,
 5844   // so we get a nicely predicted branch right at the start of the check.
 5845   cmpptr(sub_klass, super_klass);
 5846   local_jcc(Assembler::equal, *L_success);
 5847 
 5848   // Check the supertype display:
 5849   if (must_load_sco) {
 5850     // Positive movl does right thing on LP64.
 5851     movl(temp_reg, super_check_offset_addr);
 5852     super_check_offset = RegisterOrConstant(temp_reg);
 5853   }
 5854   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
 5855   cmpptr(super_klass, super_check_addr); // load displayed supertype
 5856 
 5857   // This check has worked decisively for primary supers.
 5858   // Secondary supers are sought in the super_cache ('super_cache_addr').
 5859   // (Secondary supers are interfaces and very deeply nested subtypes.)
 5860   // This works in the same check above because of a tricky aliasing
 5861   // between the super_cache and the primary super display elements.
 5862   // (The 'super_check_addr' can address either, as the case requires.)
 5863   // Note that the cache is updated below if it does not help us find
 5864   // what we need immediately.
 5865   // So if it was a primary super, we can just fail immediately.
 5866   // Otherwise, it's the slow path for us (no success at this point).
 5867 
 5868   if (super_check_offset.is_register()) {
 5869     local_jcc(Assembler::equal, *L_success);
 5870     cmpl(super_check_offset.as_register(), sc_offset);
 5871     if (L_failure == &L_fallthrough) {
 5872       local_jcc(Assembler::equal, *L_slow_path);
 5873     } else {
 5874       local_jcc(Assembler::notEqual, *L_failure);
 5875       final_jmp(*L_slow_path);
 5876     }
 5877   } else if (super_check_offset.as_constant() == sc_offset) {
 5878     // Need a slow path; fast failure is impossible.
 5879     if (L_slow_path == &L_fallthrough) {
 5880       local_jcc(Assembler::equal, *L_success);
 5881     } else {
 5882       local_jcc(Assembler::notEqual, *L_slow_path);
 5883       final_jmp(*L_success);
 5884     }
 5885   } else {
 5886     // No slow path; it's a fast decision.
 5887     if (L_failure == &L_fallthrough) {
 5888       local_jcc(Assembler::equal, *L_success);
 5889     } else {
 5890       local_jcc(Assembler::notEqual, *L_failure);
 5891       final_jmp(*L_success);
 5892     }
 5893   }
 5894 
 5895   bind(L_fallthrough);
 5896 
 5897 #undef local_jcc
 5898 #undef final_jmp
 5899 }
 5900 
 5901 
 5902 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
 5903                                                    Register super_klass,
 5904                                                    Register temp_reg,
 5905                                                    Register temp2_reg,
 5906                                                    Label* L_success,
 5907                                                    Label* L_failure,
 5908                                                    bool set_cond_codes) {
 5909   assert_different_registers(sub_klass, super_klass, temp_reg);
 5910   if (temp2_reg != noreg)
 5911     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
 5912 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
 5913 
 5914   Label L_fallthrough;
 5915   int label_nulls = 0;
 5916   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
 5917   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
 5918   assert(label_nulls <= 1, "at most one NULL in the batch");
 5919 
 5920   // a couple of useful fields in sub_klass:
 5921   int ss_offset = in_bytes(Klass::secondary_supers_offset());
 5922   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 5923   Address secondary_supers_addr(sub_klass, ss_offset);
 5924   Address super_cache_addr(     sub_klass, sc_offset);
 5925 
 5926   // Do a linear scan of the secondary super-klass chain.
 5927   // This code is rarely used, so simplicity is a virtue here.
 5928   // The repne_scan instruction uses fixed registers, which we must spill.
 5929   // Don't worry too much about pre-existing connections with the input regs.
 5930 
 5931   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
 5932   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
 5933 
 5934   // Get super_klass value into rax (even if it was in rdi or rcx).
 5935   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
 5936   if (super_klass != rax || UseCompressedOops) {
 5937     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
 5938     mov(rax, super_klass);
 5939   }
 5940   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
 5941   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
 5942 
 5943 #ifndef PRODUCT
 5944   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
 5945   ExternalAddress pst_counter_addr((address) pst_counter);
 5946   NOT_LP64(  incrementl(pst_counter_addr) );
 5947   LP64_ONLY( lea(rcx, pst_counter_addr) );
 5948   LP64_ONLY( incrementl(Address(rcx, 0)) );
 5949 #endif //PRODUCT
 5950 
 5951   // We will consult the secondary-super array.
 5952   movptr(rdi, secondary_supers_addr);
 5953   // Load the array length.  (Positive movl does right thing on LP64.)
 5954   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
 5955   // Skip to start of data.
 5956   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
 5957 
 5958   // Scan RCX words at [RDI] for an occurrence of RAX.
 5959   // Set NZ/Z based on last compare.
 5960   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
 5961   // not change flags (only scas instruction which is repeated sets flags).
 5962   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
 5963 
 5964   testptr(rax, rax); // Set Z = 0
 5965   repne_scan();
 5966 
 5967   // Unspill the temp. registers:
 5968   if (pushed_rdi)  pop(rdi);
 5969   if (pushed_rcx)  pop(rcx);
 5970   if (pushed_rax)  pop(rax);
 5971 
 5972   if (set_cond_codes) {
 5973     // Special hack for the AD files:  rdi is guaranteed non-zero.
 5974     assert(!pushed_rdi, "rdi must be left non-NULL");
 5975     // Also, the condition codes are properly set Z/NZ on succeed/failure.
 5976   }
 5977 
 5978   if (L_failure == &L_fallthrough)
 5979         jccb(Assembler::notEqual, *L_failure);
 5980   else  jcc(Assembler::notEqual, *L_failure);
 5981 
 5982   // Success.  Cache the super we found and proceed in triumph.
 5983   movptr(super_cache_addr, super_klass);
 5984 
 5985   if (L_success != &L_fallthrough) {
 5986     jmp(*L_success);
 5987   }
 5988 
 5989 #undef IS_A_TEMP
 5990 
 5991   bind(L_fallthrough);
 5992 }
 5993 
 5994 
 5995 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
 5996   if (VM_Version::supports_cmov()) {
 5997     cmovl(cc, dst, src);
 5998   } else {
 5999     Label L;
 6000     jccb(negate_condition(cc), L);
 6001     movl(dst, src);
 6002     bind(L);
 6003   }
 6004 }
 6005 
 6006 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
 6007   if (VM_Version::supports_cmov()) {
 6008     cmovl(cc, dst, src);
 6009   } else {
 6010     Label L;
 6011     jccb(negate_condition(cc), L);
 6012     movl(dst, src);
 6013     bind(L);
 6014   }
 6015 }
 6016 
 6017 void MacroAssembler::verify_oop(Register reg, const char* s) {
 6018   if (!VerifyOops) return;
 6019 
 6020   // Pass register number to verify_oop_subroutine
 6021   const char* b = NULL;
 6022   {
 6023     ResourceMark rm;
 6024     stringStream ss;
 6025     ss.print("verify_oop: %s: %s", reg->name(), s);
 6026     b = code_string(ss.as_string());
 6027   }
 6028   BLOCK_COMMENT("verify_oop {");
 6029 #ifdef _LP64
 6030   push(rscratch1);                    // save r10, trashed by movptr()
 6031 #endif
 6032   push(rax);                          // save rax,
 6033   push(reg);                          // pass register argument
 6034   ExternalAddress buffer((address) b);
 6035   // avoid using pushptr, as it modifies scratch registers
 6036   // and our contract is not to modify anything
 6037   movptr(rax, buffer.addr());
 6038   push(rax);
 6039   // call indirectly to solve generation ordering problem
 6040   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 6041   call(rax);
 6042   // Caller pops the arguments (oop, message) and restores rax, r10
 6043   BLOCK_COMMENT("} verify_oop");
 6044 }
 6045 
 6046 
 6047 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 6048                                                       Register tmp,
 6049                                                       int offset) {
 6050   intptr_t value = *delayed_value_addr;
 6051   if (value != 0)
 6052     return RegisterOrConstant(value + offset);
 6053 
 6054   // load indirectly to solve generation ordering problem
 6055   movptr(tmp, ExternalAddress((address) delayed_value_addr));
 6056 
 6057 #ifdef ASSERT
 6058   { Label L;
 6059     testptr(tmp, tmp);
 6060     if (WizardMode) {
 6061       const char* buf = NULL;
 6062       {
 6063         ResourceMark rm;
 6064         stringStream ss;
 6065         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
 6066         buf = code_string(ss.as_string());
 6067       }
 6068       jcc(Assembler::notZero, L);
 6069       STOP(buf);
 6070     } else {
 6071       jccb(Assembler::notZero, L);
 6072       hlt();
 6073     }
 6074     bind(L);
 6075   }
 6076 #endif
 6077 
 6078   if (offset != 0)
 6079     addptr(tmp, offset);
 6080 
 6081   return RegisterOrConstant(tmp);
 6082 }
 6083 
 6084 
 6085 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
 6086                                          int extra_slot_offset) {
 6087   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
 6088   int stackElementSize = Interpreter::stackElementSize;
 6089   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
 6090 #ifdef ASSERT
 6091   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
 6092   assert(offset1 - offset == stackElementSize, "correct arithmetic");
 6093 #endif
 6094   Register             scale_reg    = noreg;
 6095   Address::ScaleFactor scale_factor = Address::no_scale;
 6096   if (arg_slot.is_constant()) {
 6097     offset += arg_slot.as_constant() * stackElementSize;
 6098   } else {
 6099     scale_reg    = arg_slot.as_register();
 6100     scale_factor = Address::times(stackElementSize);
 6101   }
 6102   offset += wordSize;           // return PC is on stack
 6103   return Address(rsp, scale_reg, scale_factor, offset);
 6104 }
 6105 
 6106 
 6107 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
 6108   if (!VerifyOops) return;
 6109 
 6110   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
 6111   // Pass register number to verify_oop_subroutine
 6112   const char* b = NULL;
 6113   {
 6114     ResourceMark rm;
 6115     stringStream ss;
 6116     ss.print("verify_oop_addr: %s", s);
 6117     b = code_string(ss.as_string());
 6118   }
 6119 #ifdef _LP64
 6120   push(rscratch1);                    // save r10, trashed by movptr()
 6121 #endif
 6122   push(rax);                          // save rax,
 6123   // addr may contain rsp so we will have to adjust it based on the push
 6124   // we just did (and on 64 bit we do two pushes)
 6125   // NOTE: the 64-bit code seems to have had a bug where it did movq(addr, rax),
 6126   // storing rax into addr, which is the reverse of what was intended.
 6127   if (addr.uses(rsp)) {
 6128     lea(rax, addr);
 6129     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
 6130   } else {
 6131     pushptr(addr);
 6132   }
 6133 
 6134   ExternalAddress buffer((address) b);
 6135   // pass msg argument
 6136   // avoid using pushptr, as it modifies scratch registers
 6137   // and our contract is not to modify anything
 6138   movptr(rax, buffer.addr());
 6139   push(rax);
 6140 
 6141   // call indirectly to solve generation ordering problem
 6142   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
 6143   call(rax);
 6144   // Caller pops the arguments (addr, message) and restores rax, r10.
 6145 }
 6146 
 6147 void MacroAssembler::verify_tlab() {
 6148 #ifdef ASSERT
 6149   if (UseTLAB && VerifyOops) {
 6150     Label next, ok;
 6151     Register t1 = rsi;
 6152     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
 6153 
 6154     push(t1);
 6155     NOT_LP64(push(thread_reg));
 6156     NOT_LP64(get_thread(thread_reg));
 6157 
 6158     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
 6159     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
 6160     jcc(Assembler::aboveEqual, next);
 6161     STOP("assert(top >= start)");
 6162     should_not_reach_here();
 6163 
 6164     bind(next);
 6165     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
 6166     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
 6167     jcc(Assembler::aboveEqual, ok);
 6168     STOP("assert(top <= end)");
 6169     should_not_reach_here();
 6170 
 6171     bind(ok);
 6172     NOT_LP64(pop(thread_reg));
 6173     pop(t1);
 6174   }
 6175 #endif
 6176 }
 6177 
 6178 class ControlWord {
 6179  public:
 6180   int32_t _value;
 6181 
 6182   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
 6183   int  precision_control() const       { return  (_value >>  8) & 3      ; }
 6184   bool precision() const               { return ((_value >>  5) & 1) != 0; }
 6185   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
 6186   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
 6187   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
 6188   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
 6189   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
 6190 
 6191   void print() const {
 6192     // rounding control
 6193     const char* rc;
 6194     switch (rounding_control()) {
 6195       case 0: rc = "round near"; break;
 6196       case 1: rc = "round down"; break;
 6197       case 2: rc = "round up  "; break;
 6198       case 3: rc = "chop      "; break;
 6199     };
 6200     // precision control
 6201     const char* pc;
 6202     switch (precision_control()) {
 6203       case 0: pc = "24 bits "; break;
 6204       case 1: pc = "reserved"; break;
 6205       case 2: pc = "53 bits "; break;
 6206       case 3: pc = "64 bits "; break;
 6207     };
 6208     // flags
 6209     char f[9];
 6210     f[0] = ' ';
 6211     f[1] = ' ';
 6212     f[2] = (precision   ()) ? 'P' : 'p';
 6213     f[3] = (underflow   ()) ? 'U' : 'u';
 6214     f[4] = (overflow    ()) ? 'O' : 'o';
 6215     f[5] = (zero_divide ()) ? 'Z' : 'z';
 6216     f[6] = (denormalized()) ? 'D' : 'd';
 6217     f[7] = (invalid     ()) ? 'I' : 'i';
 6218     f[8] = '\x0';
 6219     // output
 6220     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
 6221   }
 6222 
 6223 };
 6224 
 6225 class StatusWord {
 6226  public:
 6227   int32_t _value;
 6228 
 6229   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
 6230   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
 6231   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
 6232   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
 6233   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
 6234   int  top() const                     { return  (_value >> 11) & 7      ; }
 6235   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
 6236   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
 6237   bool precision() const               { return ((_value >>  5) & 1) != 0; }
 6238   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
 6239   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
 6240   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
 6241   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
 6242   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
 6243 
 6244   void print() const {
 6245     // condition codes
 6246     char c[5];
 6247     c[0] = (C3()) ? '3' : '-';
 6248     c[1] = (C2()) ? '2' : '-';
 6249     c[2] = (C1()) ? '1' : '-';
 6250     c[3] = (C0()) ? '0' : '-';
 6251     c[4] = '\x0';
 6252     // flags
 6253     char f[9];
 6254     f[0] = (error_status()) ? 'E' : '-';
 6255     f[1] = (stack_fault ()) ? 'S' : '-';
 6256     f[2] = (precision   ()) ? 'P' : '-';
 6257     f[3] = (underflow   ()) ? 'U' : '-';
 6258     f[4] = (overflow    ()) ? 'O' : '-';
 6259     f[5] = (zero_divide ()) ? 'Z' : '-';
 6260     f[6] = (denormalized()) ? 'D' : '-';
 6261     f[7] = (invalid     ()) ? 'I' : '-';
 6262     f[8] = '\x0';
 6263     // output
 6264     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
 6265   }
 6266 
 6267 };
 6268 
 6269 class TagWord {
 6270  public:
 6271   int32_t _value;
 6272 
 6273   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
 6274 
 6275   void print() const {
 6276     printf("%04x", _value & 0xFFFF);
 6277   }
 6278 
 6279 };
 6280 
 6281 class FPU_Register {
 6282  public:
 6283   int32_t _m0;
 6284   int32_t _m1;
 6285   int16_t _ex;
 6286 
 6287   bool is_indefinite() const           {
 6288     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
 6289   }
 6290 
 6291   void print() const {
 6292     char  sign = (_ex < 0) ? '-' : '+';
 6293     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
 6294     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
 6295   };
 6296 
 6297 };
 6298 
 6299 class FPU_State {
 6300  public:
 6301   enum {
 6302     register_size       = 10,
 6303     number_of_registers =  8,
 6304     register_mask       =  7
 6305   };
 6306 
 6307   ControlWord  _control_word;
 6308   StatusWord   _status_word;
 6309   TagWord      _tag_word;
 6310   int32_t      _error_offset;
 6311   int32_t      _error_selector;
 6312   int32_t      _data_offset;
 6313   int32_t      _data_selector;
 6314   int8_t       _register[register_size * number_of_registers];
 6315 
 6316   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
 6317   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
 6318 
 6319   const char* tag_as_string(int tag) const {
 6320     switch (tag) {
 6321       case 0: return "valid";
 6322       case 1: return "zero";
 6323       case 2: return "special";
 6324       case 3: return "empty";
 6325     }
 6326     ShouldNotReachHere();
 6327     return NULL;
 6328   }
 6329 
 6330   void print() const {
 6331     // print computation registers
 6332     { int t = _status_word.top();
 6333       for (int i = 0; i < number_of_registers; i++) {
 6334         int j = (i - t) & register_mask;
 6335         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
 6336         st(j)->print();
 6337         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
 6338       }
 6339     }
 6340     printf("\n");
 6341     // print control registers
 6342     printf("ctrl = "); _control_word.print(); printf("\n");
 6343     printf("stat = "); _status_word .print(); printf("\n");
 6344     printf("tags = "); _tag_word    .print(); printf("\n");
 6345   }
 6346 
 6347 };
 6348 
 6349 class Flag_Register {
 6350  public:
 6351   int32_t _value;
 6352 
 6353   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
 6354   bool direction() const               { return ((_value >> 10) & 1) != 0; }
 6355   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
 6356   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
 6357   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
 6358   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
 6359   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
 6360 
 6361   void print() const {
 6362     // flags
 6363     char f[8];
 6364     f[0] = (overflow       ()) ? 'O' : '-';
 6365     f[1] = (direction      ()) ? 'D' : '-';
 6366     f[2] = (sign           ()) ? 'S' : '-';
 6367     f[3] = (zero           ()) ? 'Z' : '-';
 6368     f[4] = (auxiliary_carry()) ? 'A' : '-';
 6369     f[5] = (parity         ()) ? 'P' : '-';
 6370     f[6] = (carry          ()) ? 'C' : '-';
 6371     f[7] = '\x0';
 6372     // output
 6373     printf("%08x  flags = %s", _value, f);
 6374   }
 6375 
 6376 };
 6377 
 6378 class IU_Register {
 6379  public:
 6380   int32_t _value;
 6381 
 6382   void print() const {
 6383     printf("%08x  %11d", _value, _value);
 6384   }
 6385 
 6386 };
 6387 
 6388 class IU_State {
 6389  public:
 6390   Flag_Register _eflags;
 6391   IU_Register   _rdi;
 6392   IU_Register   _rsi;
 6393   IU_Register   _rbp;
 6394   IU_Register   _rsp;
 6395   IU_Register   _rbx;
 6396   IU_Register   _rdx;
 6397   IU_Register   _rcx;
 6398   IU_Register   _rax;
 6399 
 6400   void print() const {
 6401     // computation registers
 6402     printf("rax,  = "); _rax.print(); printf("\n");
 6403     printf("rbx,  = "); _rbx.print(); printf("\n");
 6404     printf("rcx  = "); _rcx.print(); printf("\n");
 6405     printf("rdx  = "); _rdx.print(); printf("\n");
 6406     printf("rdi  = "); _rdi.print(); printf("\n");
 6407     printf("rsi  = "); _rsi.print(); printf("\n");
 6408     printf("rbp,  = "); _rbp.print(); printf("\n");
 6409     printf("rsp  = "); _rsp.print(); printf("\n");
 6410     printf("\n");
 6411     // control registers
 6412     printf("flgs = "); _eflags.print(); printf("\n");
 6413   }
 6414 };
 6415 
 6416 
 6417 class CPU_State {
 6418  public:
 6419   FPU_State _fpu_state;
 6420   IU_State  _iu_state;
 6421 
 6422   void print() const {
 6423     printf("--------------------------------------------------\n");
 6424     _iu_state .print();
 6425     printf("\n");
 6426     _fpu_state.print();
 6427     printf("--------------------------------------------------\n");
 6428   }
 6429 
 6430 };
 6431 
 6432 
 6433 static void _print_CPU_state(CPU_State* state) {
 6434   state->print();
 6435 };
 6436 
 6437 
 6438 void MacroAssembler::print_CPU_state() {
 6439   push_CPU_state();
 6440   push(rsp);                // pass CPU state
 6441   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
 6442   addptr(rsp, wordSize);       // discard argument
 6443   pop_CPU_state();
 6444 }
 6445 
 6446 
 6447 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
 6448   static int counter = 0;
 6449   FPU_State* fs = &state->_fpu_state;
 6450   counter++;
 6451   // For leaf calls, only verify that the top few elements remain empty.
 6452   // We only need 1 empty at the top for C2 code.
 6453   if( stack_depth < 0 ) {
 6454     if( fs->tag_for_st(7) != 3 ) {
 6455       printf("FPR7 not empty\n");
 6456       state->print();
 6457       assert(false, "error");
 6458       return false;
 6459     }
 6460     return true;                // All other stack states do not matter
 6461   }
 6462 
 6463   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
 6464          "bad FPU control word");
 6465 
 6466   // compute stack depth
 6467   int i = 0;
 6468   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
 6469   int d = i;
 6470   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
 6471   // verify findings
 6472   if (i != FPU_State::number_of_registers) {
 6473     // stack not contiguous
 6474     printf("%s: stack not contiguous at ST%d\n", s, i);
 6475     state->print();
 6476     assert(false, "error");
 6477     return false;
 6478   }
 6479   // check if computed stack depth corresponds to expected stack depth
 6480   if (stack_depth < 0) {
 6481     // expected stack depth is -stack_depth or less
 6482     if (d > -stack_depth) {
 6483       // too many elements on the stack
 6484       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
 6485       state->print();
 6486       assert(false, "error");
 6487       return false;
 6488     }
 6489   } else {
 6490     // expected stack depth is stack_depth
 6491     if (d != stack_depth) {
 6492       // wrong stack depth
 6493       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
 6494       state->print();
 6495       assert(false, "error");
 6496       return false;
 6497     }
 6498   }
 6499   // everything is cool
 6500   return true;
 6501 }
 6502 
 6503 
 6504 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
 6505   if (!VerifyFPU) return;
 6506   push_CPU_state();
 6507   push(rsp);                // pass CPU state
 6508   ExternalAddress msg((address) s);
 6509   // pass message string s
 6510   pushptr(msg.addr());
 6511   push(stack_depth);        // pass stack depth
 6512   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
 6513   addptr(rsp, 3 * wordSize);   // discard arguments
 6514   // check for error
 6515   { Label L;
 6516     testl(rax, rax);
 6517     jcc(Assembler::notZero, L);
 6518     int3();                  // break if error condition
 6519     bind(L);
 6520   }
 6521   pop_CPU_state();
 6522 }
 6523 
 6524 void MacroAssembler::restore_cpu_control_state_after_jni() {
 6525   // Either restore the MXCSR register after returning from the JNI Call
 6526   // or verify that it wasn't changed (with -Xcheck:jni flag).
 6527   if (VM_Version::supports_sse()) {
 6528     if (RestoreMXCSROnJNICalls) {
 6529       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
 6530     } else if (CheckJNICalls) {
 6531       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
 6532     }
 6533   }
 6534   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
 6535   vzeroupper();
 6536   // Reset k1 to 0xffff.
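        // (Generated EVEX code uses k1 as an implicit all-ones mask; native code
        //  reached via JNI may have changed it.)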
 6537   if (VM_Version::supports_evex()) {
 6538     push(rcx);
 6539     movl(rcx, 0xffff);
 6540     kmovwl(k1, rcx);
 6541     pop(rcx);
 6542   }
 6543 
 6544 #ifndef _LP64
 6545   // Either restore the x87 floating pointer control word after returning
 6546   // from the JNI call or verify that it wasn't changed.
 6547   if (CheckJNICalls) {
 6548     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
 6549   }
 6550 #endif // _LP64
 6551 }
 6552 
 6553 // ((OopHandle)result).resolve();
 6554 void MacroAssembler::resolve_oop_handle(Register result) {
 6555   // OopHandle::resolve is an indirection.
 6556   movptr(result, Address(result, 0));
 6557 }
 6558 
 6559 void MacroAssembler::load_mirror(Register mirror, Register method) {
 6560   // get mirror
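        // mirror = method->constMethod()->constants()->pool_holder()->java_mirror(),
        // an OopHandle that resolve_oop_handle() turns into the java.lang.Class oop.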
 6561   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
 6562   movptr(mirror, Address(method, Method::const_offset()));
 6563   movptr(mirror, Address(mirror, ConstMethod::constants_offset()));
 6564   movptr(mirror, Address(mirror, ConstantPool::pool_holder_offset_in_bytes()));
 6565   movptr(mirror, Address(mirror, mirror_offset));
 6566   resolve_oop_handle(mirror);
 6567 }
 6568 
 6569 void MacroAssembler::load_klass(Register dst, Register src) {
 6570 #ifdef _LP64
 6571   if (UseCompressedClassPointers) {
 6572     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
 6573     decode_klass_not_null(dst);
 6574   } else
 6575 #endif
 6576     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
 6577 }
 6578 
 6579 void MacroAssembler::load_prototype_header(Register dst, Register src) {
 6580   load_klass(dst, src);
 6581   movptr(dst, Address(dst, Klass::prototype_header_offset()));
 6582 }
 6583 
 6584 void MacroAssembler::store_klass(Register dst, Register src) {
 6585 #ifdef _LP64
 6586   if (UseCompressedClassPointers) {
 6587     encode_klass_not_null(src);
 6588     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
 6589   } else
 6590 #endif
 6591     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
 6592 }
 6593 
 6594 void MacroAssembler::load_heap_oop(Register dst, Address src) {
 6595 #ifdef _LP64
 6596   // FIXME: Must change all places where we try to load the klass.
 6597   if (UseCompressedOops) {
 6598     movl(dst, src);
 6599     decode_heap_oop(dst);
 6600   } else
 6601 #endif
 6602     movptr(dst, src);
 6603 }
 6604 
 6605 // Doesn't do verification, generates fixed-size code
 6606 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
 6607 #ifdef _LP64
 6608   if (UseCompressedOops) {
 6609     movl(dst, src);
 6610     decode_heap_oop_not_null(dst);
 6611   } else
 6612 #endif
 6613     movptr(dst, src);
 6614 }
 6615 
 6616 void MacroAssembler::store_heap_oop(Address dst, Register src) {
 6617 #ifdef _LP64
 6618   if (UseCompressedOops) {
 6619     assert(!dst.uses(src), "not enough registers");
 6620     encode_heap_oop(src);
 6621     movl(dst, src);
 6622   } else
 6623 #endif
 6624     movptr(dst, src);
 6625 }
 6626 
 6627 void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
 6628   assert_different_registers(src1, tmp);
 6629 #ifdef _LP64
 6630   if (UseCompressedOops) {
 6631     bool did_push = false;
 6632     if (tmp == noreg) {
 6633       tmp = rax;
 6634       push(tmp);
 6635       did_push = true;
 6636       assert(!src2.uses(rsp), "can't push");
 6637     }
 6638     load_heap_oop(tmp, src2);
 6639     cmpptr(src1, tmp);
 6640     if (did_push)  pop(tmp);
 6641   } else
 6642 #endif
 6643     cmpptr(src1, src2);
 6644 }
 6645 
 6646 // Used for storing NULLs.
 6647 void MacroAssembler::store_heap_oop_null(Address dst) {
 6648 #ifdef _LP64
 6649   if (UseCompressedOops) {
 6650     movl(dst, (int32_t)NULL_WORD);
 6651   } else {
 6652     movslq(dst, (int32_t)NULL_WORD);
 6653   }
 6654 #else
 6655   movl(dst, (int32_t)NULL_WORD);
 6656 #endif
 6657 }
 6658 
 6659 #ifdef _LP64
 6660 void MacroAssembler::store_klass_gap(Register dst, Register src) {
 6661   if (UseCompressedClassPointers) {
 6662     // Store to klass gap in destination
 6663     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
 6664   }
 6665 }
 6666 
 6667 #ifdef ASSERT
 6668 void MacroAssembler::verify_heapbase(const char* msg) {
 6669   assert (UseCompressedOops, "should be compressed");
 6670   assert (Universe::heap() != NULL, "java heap should be initialized");
 6671   if (CheckCompressedOops) {
 6672     Label ok;
 6673     push(rscratch1); // cmpptr trashes rscratch1
 6674     cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
 6675     jcc(Assembler::equal, ok);
 6676     STOP(msg);
 6677     bind(ok);
 6678     pop(rscratch1);
 6679   }
 6680 }
 6681 #endif
 6682 
 6683 // Algorithm must match oop.inline.hpp encode_heap_oop.
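      // In essence: narrowOop = (oop == NULL) ? 0 : (oop - narrow_oop_base) >> narrow_oop_shift
      // (when a heap base is in use, the cmov below maps a NULL oop to the heap base
      //  so that the subtraction yields 0).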
 6684 void MacroAssembler::encode_heap_oop(Register r) {
 6685 #ifdef ASSERT
 6686   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
 6687 #endif
 6688   verify_oop(r, "broken oop in encode_heap_oop");
 6689   if (Universe::narrow_oop_base() == NULL) {
 6690     if (Universe::narrow_oop_shift() != 0) {
 6691       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
 6692       shrq(r, LogMinObjAlignmentInBytes);
 6693     }
 6694     return;
 6695   }
 6696   testq(r, r);
 6697   cmovq(Assembler::equal, r, r12_heapbase);
 6698   subq(r, r12_heapbase);
 6699   shrq(r, LogMinObjAlignmentInBytes);
 6700 }
 6701 
 6702 void MacroAssembler::encode_heap_oop_not_null(Register r) {
 6703 #ifdef ASSERT
 6704   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
 6705   if (CheckCompressedOops) {
 6706     Label ok;
 6707     testq(r, r);
 6708     jcc(Assembler::notEqual, ok);
 6709     STOP("null oop passed to encode_heap_oop_not_null");
 6710     bind(ok);
 6711   }
 6712 #endif
 6713   verify_oop(r, "broken oop in encode_heap_oop_not_null");
 6714   if (Universe::narrow_oop_base() != NULL) {
 6715     subq(r, r12_heapbase);
 6716   }
 6717   if (Universe::narrow_oop_shift() != 0) {
 6718     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
 6719     shrq(r, LogMinObjAlignmentInBytes);
 6720   }
 6721 }
 6722 
 6723 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
 6724 #ifdef ASSERT
 6725   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
 6726   if (CheckCompressedOops) {
 6727     Label ok;
 6728     testq(src, src);
 6729     jcc(Assembler::notEqual, ok);
 6730     STOP("null oop passed to encode_heap_oop_not_null2");
 6731     bind(ok);
 6732   }
 6733 #endif
 6734   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
 6735   if (dst != src) {
 6736     movq(dst, src);
 6737   }
 6738   if (Universe::narrow_oop_base() != NULL) {
 6739     subq(dst, r12_heapbase);
 6740   }
 6741   if (Universe::narrow_oop_shift() != 0) {
 6742     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
 6743     shrq(dst, LogMinObjAlignmentInBytes);
 6744   }
 6745 }
 6746 
 6747 void  MacroAssembler::decode_heap_oop(Register r) {
 6748 #ifdef ASSERT
 6749   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
 6750 #endif
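        // Inverse of encode_heap_oop: oop = (narrowOop == 0) ? NULL : narrow_oop_base + (narrowOop << shift)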
 6751   if (Universe::narrow_oop_base() == NULL) {
 6752     if (Universe::narrow_oop_shift() != 0) {
 6753       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
 6754       shlq(r, LogMinObjAlignmentInBytes);
 6755     }
 6756   } else {
 6757     Label done;
 6758     shlq(r, LogMinObjAlignmentInBytes);
 6759     jccb(Assembler::equal, done);
 6760     addq(r, r12_heapbase);
 6761     bind(done);
 6762   }
 6763   verify_oop(r, "broken oop in decode_heap_oop");
 6764 }
 6765 
 6766 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
 6767   // Note: it will change flags
 6768   assert (UseCompressedOops, "should only be used for compressed headers");
 6769   assert (Universe::heap() != NULL, "java heap should be initialized");
 6770   // Cannot assert, unverified entry point counts instructions (see .ad file)
 6771   // vtableStubs also counts instructions in pd_code_size_limit.
 6772   // Also do not verify_oop as this is called by verify_oop.
 6773   if (Universe::narrow_oop_shift() != 0) {
 6774     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
 6775     shlq(r, LogMinObjAlignmentInBytes);
 6776     if (Universe::narrow_oop_base() != NULL) {
 6777       addq(r, r12_heapbase);
 6778     }
 6779   } else {
 6780     assert (Universe::narrow_oop_base() == NULL, "sanity");
 6781   }
 6782 }
 6783 
 6784 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
 6785   // Note: it will change flags
 6786   assert (UseCompressedOops, "should only be used for compressed headers");
 6787   assert (Universe::heap() != NULL, "java heap should be initialized");
 6788   // Cannot assert, unverified entry point counts instructions (see .ad file)
 6789   // vtableStubs also counts instructions in pd_code_size_limit.
 6790   // Also do not verify_oop as this is called by verify_oop.
 6791   if (Universe::narrow_oop_shift() != 0) {
 6792     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
 6793     if (LogMinObjAlignmentInBytes == Address::times_8) {
 6794       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
 6795     } else {
 6796       if (dst != src) {
 6797         movq(dst, src);
 6798       }
 6799       shlq(dst, LogMinObjAlignmentInBytes);
 6800       if (Universe::narrow_oop_base() != NULL) {
 6801         addq(dst, r12_heapbase);
 6802       }
 6803     }
 6804   } else {
 6805     assert (Universe::narrow_oop_base() == NULL, "sanity");
 6806     if (dst != src) {
 6807       movq(dst, src);
 6808     }
 6809   }
 6810 }
 6811 
 6812 void MacroAssembler::encode_klass_not_null(Register r) {
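        // narrowKlass = (klass - narrow_klass_base) >> narrow_klass_shift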
 6813   if (Universe::narrow_klass_base() != NULL) {
 6814     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
 6815     assert(r != r12_heapbase, "Encoding a klass in r12");
 6816     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
 6817     subq(r, r12_heapbase);
 6818   }
 6819   if (Universe::narrow_klass_shift() != 0) {
 6820     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
 6821     shrq(r, LogKlassAlignmentInBytes);
 6822   }
 6823   if (Universe::narrow_klass_base() != NULL) {
 6824     reinit_heapbase();
 6825   }
 6826 }
 6827 
 6828 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
 6829   if (dst == src) {
 6830     encode_klass_not_null(src);
 6831   } else {
 6832     if (Universe::narrow_klass_base() != NULL) {
 6833       mov64(dst, (int64_t)Universe::narrow_klass_base());
 6834       negq(dst);
 6835       addq(dst, src);
 6836     } else {
 6837       movptr(dst, src);
 6838     }
 6839     if (Universe::narrow_klass_shift() != 0) {
 6840       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
 6841       shrq(dst, LogKlassAlignmentInBytes);
 6842     }
 6843   }
 6844 }
 6845 
 6846 // Function instr_size_for_decode_klass_not_null() counts the instructions
 6847 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
 6848 // when (Universe::heap() != NULL).  Hence, if the instructions they
 6849 // generate change, then this method needs to be updated.
 6850 int MacroAssembler::instr_size_for_decode_klass_not_null() {
 6851   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
 6852   if (Universe::narrow_klass_base() != NULL) {
 6853     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
 6854     return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
 6855   } else {
 6856     // longest load decode klass function, mov64, leaq
 6857     return 16;
 6858   }
 6859 }
 6860 
 6861 // !!! If the instructions that get generated here change then function
 6862 // instr_size_for_decode_klass_not_null() needs to get updated.
 6863 void  MacroAssembler::decode_klass_not_null(Register r) {
 6864   // Note: it will change flags
 6865   assert (UseCompressedClassPointers, "should only be used for compressed headers");
 6866   assert(r != r12_heapbase, "Decoding a klass in r12");
 6867   // Cannot assert, unverified entry point counts instructions (see .ad file)
 6868   // vtableStubs also counts instructions in pd_code_size_limit.
 6869   // Also do not verify_oop as this is called by verify_oop.
 6870   if (Universe::narrow_klass_shift() != 0) {
 6871     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
 6872     shlq(r, LogKlassAlignmentInBytes);
 6873   }
 6874   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
 6875   if (Universe::narrow_klass_base() != NULL) {
 6876     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
 6877     addq(r, r12_heapbase);
 6878     reinit_heapbase();
 6879   }
 6880 }
 6881 
 6882 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
 6883   // Note: it will change flags
 6884   assert (UseCompressedClassPointers, "should only be used for compressed headers");
 6885   if (dst == src) {
 6886     decode_klass_not_null(dst);
 6887   } else {
 6888     // Cannot assert, unverified entry point counts instructions (see .ad file)
 6889     // vtableStubs also counts instructions in pd_code_size_limit.
 6890     // Also do not verify_oop as this is called by verify_oop.
 6891     mov64(dst, (int64_t)Universe::narrow_klass_base());
 6892     if (Universe::narrow_klass_shift() != 0) {
 6893       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
 6894       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
 6895       leaq(dst, Address(dst, src, Address::times_8, 0));
 6896     } else {
 6897       addq(dst, src);
 6898     }
 6899   }
 6900 }
 6901 
 6902 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
 6903   assert (UseCompressedOops, "should only be used for compressed headers");
 6904   assert (Universe::heap() != NULL, "java heap should be initialized");
 6905   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6906   int oop_index = oop_recorder()->find_index(obj);
 6907   RelocationHolder rspec = oop_Relocation::spec(oop_index);
 6908   mov_narrow_oop(dst, oop_index, rspec);
 6909 }
 6910 
 6911 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
 6912   assert (UseCompressedOops, "should only be used for compressed headers");
 6913   assert (Universe::heap() != NULL, "java heap should be initialized");
 6914   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6915   int oop_index = oop_recorder()->find_index(obj);
 6916   RelocationHolder rspec = oop_Relocation::spec(oop_index);
 6917   mov_narrow_oop(dst, oop_index, rspec);
 6918 }
 6919 
 6920 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
 6921   assert (UseCompressedClassPointers, "should only be used for compressed headers");
 6922   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6923   int klass_index = oop_recorder()->find_index(k);
 6924   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
 6925   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
 6926 }
 6927 
 6928 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
 6929   assert (UseCompressedClassPointers, "should only be used for compressed headers");
 6930   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6931   int klass_index = oop_recorder()->find_index(k);
 6932   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
 6933   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
 6934 }
 6935 
 6936 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
 6937   assert (UseCompressedOops, "should only be used for compressed headers");
 6938   assert (Universe::heap() != NULL, "java heap should be initialized");
 6939   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6940   int oop_index = oop_recorder()->find_index(obj);
 6941   RelocationHolder rspec = oop_Relocation::spec(oop_index);
 6942   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
 6943 }
 6944 
 6945 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
 6946   assert (UseCompressedOops, "should only be used for compressed headers");
 6947   assert (Universe::heap() != NULL, "java heap should be initialized");
 6948   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6949   int oop_index = oop_recorder()->find_index(obj);
 6950   RelocationHolder rspec = oop_Relocation::spec(oop_index);
 6951   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
 6952 }
 6953 
 6954 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
 6955   assert (UseCompressedClassPointers, "should only be used for compressed headers");
 6956   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6957   int klass_index = oop_recorder()->find_index(k);
 6958   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
 6959   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
 6960 }
 6961 
 6962 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
 6963   assert (UseCompressedClassPointers, "should only be used for compressed headers");
 6964   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
 6965   int klass_index = oop_recorder()->find_index(k);
 6966   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
 6967   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
 6968 }
 6969 
 6970 void MacroAssembler::reinit_heapbase() {
 6971   if (UseCompressedOops || UseCompressedClassPointers) {
 6972     if (Universe::heap() != NULL) {
 6973       if (Universe::narrow_oop_base() == NULL) {
 6974         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
 6975       } else {
 6976         mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
 6977       }
 6978     } else {
 6979       movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
 6980     }
 6981   }
 6982 }
 6983 
 6984 #endif // _LP64
 6985 
 6986 // C2 compiled method's prolog code.
 6987 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
 6988 
 6989   // WARNING: Initial instruction MUST be 5 bytes or longer so that
 6990   // NativeJump::patch_verified_entry will be able to patch out the entry
 6991   // code safely. The push to verify stack depth is ok at 5 bytes,
 6992   // the frame allocation can be either 3 or 6 bytes. So if we don't do
 6993   // stack bang then we must use the 6 byte frame allocation even if
 6994   // we have no frame. :-(
 6995   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
 6996 
 6997   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
 6998   // Remove word for return addr
 6999   framesize -= wordSize;
 7000   stack_bang_size -= wordSize;
 7001 
 7002   // Calls to C2R adapters often do not accept exceptional returns.
 7003   // We require that their callers bang the stack for them.  Be careful, because
 7004   // some VM calls (such as call site linkage) can use several kilobytes of
 7005   // stack, but the stack safety zone should account for that.
 7006   // See bugs 4446381, 4468289, 4497237.
 7007   if (stack_bang_size > 0) {
 7008     generate_stack_overflow_check(stack_bang_size);
 7009 
 7010     // We always push rbp so that on return to the interpreter rbp will be
 7011     // restored correctly and we can correct the stack.
 7012     push(rbp);
 7013     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 7014     if (PreserveFramePointer) {
 7015       mov(rbp, rsp);
 7016     }
 7017     // Remove word for ebp
 7018     framesize -= wordSize;
 7019 
 7020     // Create frame
 7021     if (framesize) {
 7022       subptr(rsp, framesize);
 7023     }
 7024   } else {
 7025     // Create frame (force generation of a 4 byte immediate value)
 7026     subptr_imm32(rsp, framesize);
 7027 
 7028     // Save RBP register now.
 7029     framesize -= wordSize;
 7030     movptr(Address(rsp, framesize), rbp);
 7031     // Save caller's stack pointer into RBP if the frame pointer is preserved.
 7032     if (PreserveFramePointer) {
 7033       movptr(rbp, rsp);
 7034       if (framesize > 0) {
 7035         addptr(rbp, framesize);
 7036       }
 7037     }
 7038   }
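        // The frame now looks roughly like this (a sketch; the framesize-sized
        // area is filled in by the compiled code that follows):
        //   [ return address ]   <- caller's rsp before the call
        //   [ saved rbp      ]
        //   [ frame data     ]   remaining framesize bytes
        //                        <- rsp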
 7039 
 7040   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
 7041     framesize -= wordSize;
 7042     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 7043   }
 7044 
 7045 #ifndef _LP64
 7046   // If method sets FPU control word do it now
 7047   if (fp_mode_24b) {
 7048     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
 7049   }
 7050   if (UseSSE >= 2 && VerifyFPU) {
 7051     verify_FPU(0, "FPU stack must be clean on entry");
 7052   }
 7053 #endif
 7054 
 7055 #ifdef ASSERT
 7056   if (VerifyStackAtCalls) {
 7057     Label L;
 7058     push(rax);
 7059     mov(rax, rsp);
 7060     andptr(rax, StackAlignmentInBytes-1);
 7061     cmpptr(rax, StackAlignmentInBytes-wordSize);
 7062     pop(rax);
 7063     jcc(Assembler::equal, L);
 7064     STOP("Stack is not properly aligned!");
 7065     bind(L);
 7066   }
 7067 #endif
 7068 
 7069 }
 7070 
 7071 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) {
 7072   // cnt - number of qwords (8-byte words).
 7073   // base - start address, qword aligned.
 7074   // is_large - if optimizers know cnt is larger than InitArrayShortSize
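        // Conceptually this emits code equivalent to (a sketch; the fast paths
        // below use rep stos instead of an explicit loop):
        //   for (size_t i = 0; i < cnt; i++) {
        //     ((jlong*)base)[i] = 0;
        //   }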
 7075   assert(base==rdi, "base register must be edi for rep stos");
 7076   assert(tmp==rax,   "tmp register must be eax for rep stos");
 7077   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
 7078   assert(InitArrayShortSize % BytesPerLong == 0,
 7079     "InitArrayShortSize should be a multiple of BytesPerLong");
 7080 
 7081   Label DONE;
 7082 
 7083   xorptr(tmp, tmp);
 7084 
 7085   if (!is_large) {
 7086     Label LOOP, LONG;
 7087     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
 7088     jccb(Assembler::greater, LONG);
 7089 
 7090     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
 7091 
 7092     decrement(cnt);
 7093     jccb(Assembler::negative, DONE); // Zero length
 7094 
 7095     // Use individual pointer-sized stores for small counts:
 7096     BIND(LOOP);
 7097     movptr(Address(base, cnt, Address::times_ptr), tmp);
 7098     decrement(cnt);
 7099     jccb(Assembler::greaterEqual, LOOP);
 7100     jmpb(DONE);
 7101 
 7102     BIND(LONG);
 7103   }
 7104 
 7105   // Use longer rep-prefixed ops for non-small counts:
 7106   if (UseFastStosb) {
 7107     shlptr(cnt, 3); // convert to number of bytes
 7108     rep_stosb();
 7109   } else {
 7110     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
 7111     rep_stos();
 7112   }
 7113 
 7114   BIND(DONE);
 7115 }
 7116 
 7117 #ifdef COMPILER2
 7118 
 7119 // IndexOf for constant substrings with size >= 8 chars
 7120 // which don't need to be loaded through the stack.
 7121 void MacroAssembler::string_indexofC8(Register str1, Register str2,
 7122                                       Register cnt1, Register cnt2,
 7123                                       int int_cnt2,  Register result,
 7124                                       XMMRegister vec, Register tmp,
 7125                                       int ae) {
 7126   ShortBranchVerifier sbv(this);
 7127   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
 7128   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 7129 
 7130   // This method uses the pcmpestri instruction with bound registers
 7131   //   inputs:
 7132   //     xmm - substring
 7133   //     rax - substring length (elements count)
 7134   //     mem - scanned string
 7135   //     rdx - string length (elements count)
 7136   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
 7137   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
 7138   //   outputs:
 7139   //     rcx - matched index in string
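        // At a high level the code below computes (a rough element-wise sketch,
        // ignoring that for UL the substring bytes are zero-extended to chars):
        //   for (int i = 0; i + cnt2 <= cnt1; i++) {
        //     if (memcmp(&str1[i], &str2[0], cnt2 * elem_size) == 0) return i;
        //   }
        //   return -1;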
 7140   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
 7141   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
 7142   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
 7143   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
 7144   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
 7145 
 7146   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
 7147         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
 7148         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
 7149 
 7150   // Note, inline_string_indexOf() generates checks:
 7151   // if (substr.count > string.count) return -1;
 7152   // if (substr.count == 0) return 0;
 7153   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
 7154 
 7155   // Load substring.
 7156   if (ae == StrIntrinsicNode::UL) {
 7157     pmovzxbw(vec, Address(str2, 0));
 7158   } else {
 7159     movdqu(vec, Address(str2, 0));
 7160   }
 7161   movl(cnt2, int_cnt2);
 7162   movptr(result, str1); // string addr
 7163 
 7164   if (int_cnt2 > stride) {
 7165     jmpb(SCAN_TO_SUBSTR);
 7166 
 7167     // Reload substr for rescan; this code
 7168     // is executed only for large substrings (> 8 chars).
 7169     bind(RELOAD_SUBSTR);
 7170     if (ae == StrIntrinsicNode::UL) {
 7171       pmovzxbw(vec, Address(str2, 0));
 7172     } else {
 7173       movdqu(vec, Address(str2, 0));
 7174     }
 7175     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
 7176 
 7177     bind(RELOAD_STR);
 7178     // We came here after the beginning of the substring was
 7179     // matched but the rest of it was not, so we need to search
 7180     // again. Start from the next element after the previous match.
 7181 
 7182     // cnt2 is the number of remaining substring elements and
 7183     // cnt1 is the number of remaining string elements when the cmp failed.
 7184     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
 7185     subl(cnt1, cnt2);
 7186     addl(cnt1, int_cnt2);
 7187     movl(cnt2, int_cnt2); // Now restore cnt2
 7188 
 7189     decrementl(cnt1);     // Shift to next element
 7190     cmpl(cnt1, cnt2);
 7191     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
 7192 
 7193     addptr(result, (1<<scale1));
 7194 
 7195   } // (int_cnt2 > 8)
 7196 
 7197   // Scan string for start of substr in 16-byte vectors
 7198   bind(SCAN_TO_SUBSTR);
 7199   pcmpestri(vec, Address(result, 0), mode);
 7200   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
 7201   subl(cnt1, stride);
 7202   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
 7203   cmpl(cnt1, cnt2);
 7204   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
 7205   addptr(result, 16);
 7206   jmpb(SCAN_TO_SUBSTR);
 7207 
 7208   // Found a potential substr
 7209   bind(FOUND_CANDIDATE);
 7210   // Matched whole vector if first element matched (tmp(rcx) == 0).
 7211   if (int_cnt2 == stride) {
 7212     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
 7213   } else { // int_cnt2 > 8
 7214     jccb(Assembler::overflow, FOUND_SUBSTR);
 7215   }
 7216   // After pcmpestri tmp(rcx) contains matched element index
 7217   // Compute start addr of substr
 7218   lea(result, Address(result, tmp, scale1));
 7219 
 7220   // Make sure string is still long enough
 7221   subl(cnt1, tmp);
 7222   cmpl(cnt1, cnt2);
 7223   if (int_cnt2 == stride) {
 7224     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
 7225   } else { // int_cnt2 > 8
 7226     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
 7227   }
 7228   // Left less than substring.
 7229 
 7230   bind(RET_NOT_FOUND);
 7231   movl(result, -1);
 7232   jmp(EXIT);
 7233 
 7234   if (int_cnt2 > stride) {
 7235     // This code is optimized for the case when whole substring
 7236     // is matched if its head is matched.
 7237     bind(MATCH_SUBSTR_HEAD);
 7238     pcmpestri(vec, Address(result, 0), mode);
 7239     // Reload only the string if it does not match
 7240     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
 7241 
 7242     Label CONT_SCAN_SUBSTR;
 7243     // Compare the rest of substring (> 8 chars).
 7244     bind(FOUND_SUBSTR);
 7245     // First 8 chars are already matched.
 7246     negptr(cnt2);
 7247     addptr(cnt2, stride);
 7248 
 7249     bind(SCAN_SUBSTR);
 7250     subl(cnt1, stride);
 7251     cmpl(cnt2, -stride); // Do not read beyond substring
 7252     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
 7253     // Back-up strings to avoid reading beyond substring:
 7254     // cnt1 = cnt1 - cnt2 + 8
 7255     addl(cnt1, cnt2); // cnt2 is negative
 7256     addl(cnt1, stride);
 7257     movl(cnt2, stride); negptr(cnt2);
 7258     bind(CONT_SCAN_SUBSTR);
 7259     if (int_cnt2 < (int)G) {
 7260       int tail_off1 = int_cnt2<<scale1;
 7261       int tail_off2 = int_cnt2<<scale2;
 7262       if (ae == StrIntrinsicNode::UL) {
 7263         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
 7264       } else {
 7265         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
 7266       }
 7267       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
 7268     } else {
 7269       // calculate index in register to avoid integer overflow (int_cnt2*2)
 7270       movl(tmp, int_cnt2);
 7271       addptr(tmp, cnt2);
 7272       if (ae == StrIntrinsicNode::UL) {
 7273         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
 7274       } else {
 7275         movdqu(vec, Address(str2, tmp, scale2, 0));
 7276       }
 7277       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
 7278     }
 7279     // Need to reload strings pointers if not matched whole vector
 7280     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
 7281     addptr(cnt2, stride);
 7282     jcc(Assembler::negative, SCAN_SUBSTR);
 7283     // Fall through if found full substring
 7284 
 7285   } // (int_cnt2 > 8)
 7286 
 7287   bind(RET_FOUND);
 7288   // Found result if we matched full small substring.
 7289   // Compute substr offset
 7290   subptr(result, str1);
 7291   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
 7292     shrl(result, 1); // index
 7293   }
 7294   bind(EXIT);
 7295 
 7296 } // string_indexofC8
 7297 
 7298 // Small strings are loaded through the stack if they cross a page boundary.
 7299 void MacroAssembler::string_indexof(Register str1, Register str2,
 7300                                     Register cnt1, Register cnt2,
 7301                                     int int_cnt2,  Register result,
 7302                                     XMMRegister vec, Register tmp,
 7303                                     int ae) {
 7304   ShortBranchVerifier sbv(this);
 7305   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
 7306   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
 7307 
 7308   //
 7309   // int_cnt2 is the length of a small (< 8 chars) constant substring,
 7310   // or (-1) for a non-constant substring, in which case its length
 7311   // is in the cnt2 register.
 7312   //
 7313   // Note, inline_string_indexOf() generates checks:
 7314   // if (substr.count > string.count) return -1;
 7315   // if (substr.count == 0) return 0;
 7316   //
 7317   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
 7318   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
 7319   // This method uses the pcmpestri instruction with bound registers
 7320   //   inputs:
 7321   //     xmm - substring
 7322   //     rax - substring length (elements count)
 7323   //     mem - scanned string
 7324   //     rdx - string length (elements count)
 7325   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
 7326   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
 7327   //   outputs:
 7328   //     rcx - matched index in string
 7329   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
 7330   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
 7331   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
 7332   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
 7333 
 7334   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
 7335         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
 7336         FOUND_CANDIDATE;
 7337 
 7338   { //========================================================
 7339     // We don't know where these strings are located
 7340     // and we can't read beyond them. Load them through the stack.
 7341     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
 7342 
 7343     movptr(tmp, rsp); // save old SP
 7344 
 7345     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
 7346       if (int_cnt2 == (1>>scale2)) { // One byte
 7347         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
 7348         load_unsigned_byte(result, Address(str2, 0));
 7349         movdl(vec, result); // move 32 bits
 7350       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
 7351         // Not enough header space in 32-bit VM: 12+3 = 15.
 7352         movl(result, Address(str2, -1));
 7353         shrl(result, 8);
 7354         movdl(vec, result); // move 32 bits
 7355       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
 7356         load_unsigned_short(result, Address(str2, 0));
 7357         movdl(vec, result); // move 32 bits
 7358       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
 7359         movdl(vec, Address(str2, 0)); // move 32 bits
 7360       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
 7361         movq(vec, Address(str2, 0));  // move 64 bits
 7362       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
 7363         // Array header size is 12 bytes in 32-bit VM
 7364         // + 6 bytes for 3 chars == 18 bytes,
 7365         // enough space to load vec and shift.
 7366         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
 7367         if (ae == StrIntrinsicNode::UL) {
 7368           int tail_off = int_cnt2-8;
 7369           pmovzxbw(vec, Address(str2, tail_off));
 7370           psrldq(vec, -2*tail_off);
 7371         }
 7372         else {
 7373           int tail_off = int_cnt2*(1<<scale2);
 7374           movdqu(vec, Address(str2, tail_off-16));
 7375           psrldq(vec, 16-tail_off);
 7376         }
 7377       }
 7378     } else { // not constant substring
 7379       cmpl(cnt2, stride);
 7380       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
 7381 
 7382       // We can read beyond the string if str2+16 does not cross a page boundary,
 7383       // since heaps are aligned and mapped by pages.
 7384       assert(os::vm_page_size() < (int)G, "default page should be small");
 7385       movl(result, str2); // We need only low 32 bits
 7386       andl(result, (os::vm_page_size()-1));
 7387       cmpl(result, (os::vm_page_size()-16));
 7388       jccb(Assembler::belowEqual, CHECK_STR);
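            // Example (a sketch, assuming a 4096-byte page): if (str2 & 0xFFF) <= 0xFF0,
            // then str2+15 is still on the same page, so the 16-byte load used below
            // cannot touch a potentially unmapped neighbouring page.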
 7389 
 7390       // Move small strings to the stack so that 16 bytes can be loaded into vec.
 7391       subptr(rsp, 16);
 7392       int stk_offset = wordSize-(1<<scale2);
 7393       push(cnt2);
 7394 
 7395       bind(COPY_SUBSTR);
 7396       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
 7397         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
 7398         movb(Address(rsp, cnt2, scale2, stk_offset), result);
 7399       } else if (ae == StrIntrinsicNode::UU) {
 7400         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
 7401         movw(Address(rsp, cnt2, scale2, stk_offset), result);
 7402       }
 7403       decrement(cnt2);
 7404       jccb(Assembler::notZero, COPY_SUBSTR);
 7405 
 7406       pop(cnt2);
 7407       movptr(str2, rsp);  // New substring address
 7408     } // non constant
 7409 
 7410     bind(CHECK_STR);
 7411     cmpl(cnt1, stride);
 7412     jccb(Assembler::aboveEqual, BIG_STRINGS);
 7413 
 7414     // Check cross page boundary.
 7415     movl(result, str1); // We need only low 32 bits
 7416     andl(result, (os::vm_page_size()-1));
 7417     cmpl(result, (os::vm_page_size()-16));
 7418     jccb(Assembler::belowEqual, BIG_STRINGS);
 7419 
 7420     subptr(rsp, 16);
 7421     int stk_offset = -(1<<scale1);
 7422     if (int_cnt2 < 0) { // not constant
 7423       push(cnt2);
 7424       stk_offset += wordSize;
 7425     }
 7426     movl(cnt2, cnt1);
 7427 
 7428     bind(COPY_STR);
 7429     if (ae == StrIntrinsicNode::LL) {
 7430       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
 7431       movb(Address(rsp, cnt2, scale1, stk_offset), result);
 7432     } else {
 7433       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
 7434       movw(Address(rsp, cnt2, scale1, stk_offset), result);
 7435     }
 7436     decrement(cnt2);
 7437     jccb(Assembler::notZero, COPY_STR);
 7438 
 7439     if (int_cnt2 < 0) { // not constant
 7440       pop(cnt2);
 7441     }
 7442     movptr(str1, rsp);  // New string address
 7443 
 7444     bind(BIG_STRINGS);
 7445     // Load substring.
 7446     if (int_cnt2 < 0) { // -1
 7447       if (ae == StrIntrinsicNode::UL) {
 7448         pmovzxbw(vec, Address(str2, 0));
 7449       } else {
 7450         movdqu(vec, Address(str2, 0));
 7451       }
 7452       push(cnt2);       // substr count
 7453       push(str2);       // substr addr
 7454       push(str1);       // string addr
 7455     } else {
 7456       // Small (< 8 chars) constant substrings are loaded already.
 7457       movl(cnt2, int_cnt2);
 7458     }
 7459     push(tmp);  // original SP
 7460 
 7461   } // Finished loading
 7462 
 7463   //========================================================
 7464   // Start search
 7465   //
 7466 
 7467   movptr(result, str1); // string addr
 7468 
 7469   if (int_cnt2  < 0) {  // Only for non constant substring
 7470     jmpb(SCAN_TO_SUBSTR);
 7471 
 7472     // SP saved at sp+0
 7473     // String saved at sp+1*wordSize
 7474     // Substr saved at sp+2*wordSize
 7475     // Substr count saved at sp+3*wordSize
 7476 
 7477     // Reload substr for rescan; this code
 7478     // is executed only for large substrings (> 8 chars).
 7479     bind(RELOAD_SUBSTR);
 7480     movptr(str2, Address(rsp, 2*wordSize));
 7481     movl(cnt2, Address(rsp, 3*wordSize));
 7482     if (ae == StrIntrinsicNode::UL) {
 7483       pmovzxbw(vec, Address(str2, 0));
 7484     } else {
 7485       movdqu(vec, Address(str2, 0));
 7486     }
 7487     // We came here after the beginning of the substring was
 7488     // matched but the rest of it was not, so we need to search
 7489     // again. Start from the next element after the previous match.
 7490     subptr(str1, result); // Restore counter
 7491     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
 7492       shrl(str1, 1);
 7493     }
 7494     addl(cnt1, str1);
 7495     decrementl(cnt1);   // Shift to next element
 7496     cmpl(cnt1, cnt2);
 7497     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
 7498 
 7499     addptr(result, (1<<scale1));
 7500   } // non constant
 7501 
 7502   // Scan string for start of substr in 16-byte vectors
 7503   bind(SCAN_TO_SUBSTR);
 7504   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
 7505   pcmpestri(vec, Address(result, 0), mode);
 7506   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
 7507   subl(cnt1, stride);
 7508   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
 7509   cmpl(cnt1, cnt2);
 7510   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
 7511   addptr(result, 16);
 7512 
 7513   bind(ADJUST_STR);
 7514   cmpl(cnt1, stride); // Do not read beyond string
 7515   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
 7516   // Back-up string to avoid reading beyond string.
 7517   lea(result, Address(result, cnt1, scale1, -16));
 7518   movl(cnt1, stride);
 7519   jmpb(SCAN_TO_SUBSTR);
 7520 
 7521   // Found a potential substr
 7522   bind(FOUND_CANDIDATE);
 7523   // After pcmpestri tmp(rcx) contains matched element index
 7524 
 7525   // Make sure string is still long enough
 7526   subl(cnt1, tmp);
 7527   cmpl(cnt1, cnt2);
 7528   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
 7529   // Left less than substring.
 7530 
 7531   bind(RET_NOT_FOUND);
 7532   movl(result, -1);
 7533   jmpb(CLEANUP);
 7534 
 7535   bind(FOUND_SUBSTR);
 7536   // Compute start addr of substr
 7537   lea(result, Address(result, tmp, scale1));
 7538   if (int_cnt2 > 0) { // Constant substring
 7539     // Repeat search for small substring (< 8 chars)
 7540     // from new point without reloading substring.
 7541     // Have to check that we don't read beyond string.
 7542     cmpl(tmp, stride-int_cnt2);
 7543     jccb(Assembler::greater, ADJUST_STR);
 7544     // Fall through if matched whole substring.
 7545   } else { // non constant
 7546     assert(int_cnt2 == -1, "should be != 0");
 7547 
 7548     addl(tmp, cnt2);
 7549     // Found result if we matched whole substring.
 7550     cmpl(tmp, stride);
 7551     jccb(Assembler::lessEqual, RET_FOUND);
 7552 
 7553     // Repeat search for small substring (<= 8 chars)
 7554     // from new point 'str1' without reloading substring.
 7555     cmpl(cnt2, stride);
 7556     // Have to check that we don't read beyond string.
 7557     jccb(Assembler::lessEqual, ADJUST_STR);
 7558 
 7559     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
 7560     // Compare the rest of substring (> 8 chars).
 7561     movptr(str1, result);
 7562 
 7563     cmpl(tmp, cnt2);
 7564     // First 8 chars are already matched.
 7565     jccb(Assembler::equal, CHECK_NEXT);
 7566 
 7567     bind(SCAN_SUBSTR);
 7568     pcmpestri(vec, Address(str1, 0), mode);
 7569     // Need to reload strings pointers if not matched whole vector
 7570     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
 7571 
 7572     bind(CHECK_NEXT);
 7573     subl(cnt2, stride);
 7574     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
 7575     addptr(str1, 16);
 7576     if (ae == StrIntrinsicNode::UL) {
 7577       addptr(str2, 8);
 7578     } else {
 7579       addptr(str2, 16);
 7580     }
 7581     subl(cnt1, stride);
 7582     cmpl(cnt2, stride); // Do not read beyond substring
 7583     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
 7584     // Back-up strings to avoid reading beyond substring.
 7585 
 7586     if (ae == StrIntrinsicNode::UL) {
 7587       lea(str2, Address(str2, cnt2, scale2, -8));
 7588       lea(str1, Address(str1, cnt2, scale1, -16));
 7589     } else {
 7590       lea(str2, Address(str2, cnt2, scale2, -16));
 7591       lea(str1, Address(str1, cnt2, scale1, -16));
 7592     }
 7593     subl(cnt1, cnt2);
 7594     movl(cnt2, stride);
 7595     addl(cnt1, stride);
 7596     bind(CONT_SCAN_SUBSTR);
 7597     if (ae == StrIntrinsicNode::UL) {
 7598       pmovzxbw(vec, Address(str2, 0));
 7599     } else {
 7600       movdqu(vec, Address(str2, 0));
 7601     }
 7602     jmp(SCAN_SUBSTR);
 7603 
 7604     bind(RET_FOUND_LONG);
 7605     movptr(str1, Address(rsp, wordSize));
 7606   } // non constant
 7607 
 7608   bind(RET_FOUND);
 7609   // Compute substr offset
 7610   subptr(result, str1);
 7611   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
 7612     shrl(result, 1); // index
 7613   }
 7614   bind(CLEANUP);
 7615   pop(rsp); // restore SP
 7616 
 7617 } // string_indexof
 7618 
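      // Conceptually (a sketch): scan a UTF-16 string for a single char value,
      //   for (int i = 0; i < cnt1; i++) {
      //     if (((jchar*)str1)[i] == ch) return i;
      //   }
      //   return -1;
      // using 16-char (AVX2) or 8-char (SSE) vector compares for the bulk of the
      // string and a scalar loop for the tail.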
 7619 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
 7620                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
 7621   ShortBranchVerifier sbv(this);
 7622   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
 7623 
 7624   int stride = 8;
 7625 
 7626   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
 7627         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
 7628         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
 7629         FOUND_SEQ_CHAR, DONE_LABEL;
 7630 
 7631   movptr(result, str1);
 7632   if (UseAVX >= 2) {
 7633     cmpl(cnt1, stride);
 7634     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
 7635     cmpl(cnt1, 2*stride);
 7636     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
 7637     movdl(vec1, ch);
 7638     vpbroadcastw(vec1, vec1);
 7639     vpxor(vec2, vec2);
 7640     movl(tmp, cnt1);
 7641     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
 7642     andl(cnt1,0x0000000F);  //tail count (in chars)
 7643 
 7644     bind(SCAN_TO_16_CHAR_LOOP);
 7645     vmovdqu(vec3, Address(result, 0));
 7646     vpcmpeqw(vec3, vec3, vec1, 1);
 7647     vptest(vec2, vec3);
 7648     jcc(Assembler::carryClear, FOUND_CHAR);
 7649     addptr(result, 32);
 7650     subl(tmp, 2*stride);
 7651     jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
 7652     jmp(SCAN_TO_8_CHAR);
 7653     bind(SCAN_TO_8_CHAR_INIT);
 7654     movdl(vec1, ch);
 7655     pshuflw(vec1, vec1, 0x00);
 7656     pshufd(vec1, vec1, 0);
 7657     pxor(vec2, vec2);
 7658   }
 7659   bind(SCAN_TO_8_CHAR);
 7660   cmpl(cnt1, stride);
 7661   if (UseAVX >= 2) {
 7662     jcc(Assembler::less, SCAN_TO_CHAR);
 7663   } else {
 7664     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
 7665     movdl(vec1, ch);
 7666     pshuflw(vec1, vec1, 0x00);
 7667     pshufd(vec1, vec1, 0);
 7668     pxor(vec2, vec2);
 7669   }
 7670   movl(tmp, cnt1);
 7671   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
 7672   andl(cnt1,0x00000007);  //tail count (in chars)
 7673 
 7674   bind(SCAN_TO_8_CHAR_LOOP);
 7675   movdqu(vec3, Address(result, 0));
 7676   pcmpeqw(vec3, vec1);
 7677   ptest(vec2, vec3);
 7678   jcc(Assembler::carryClear, FOUND_CHAR);
 7679   addptr(result, 16);
 7680   subl(tmp, stride);
 7681   jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
 7682   bind(SCAN_TO_CHAR);
 7683   testl(cnt1, cnt1);
 7684   jcc(Assembler::zero, RET_NOT_FOUND);
 7685   bind(SCAN_TO_CHAR_LOOP);
 7686   load_unsigned_short(tmp, Address(result, 0));
 7687   cmpl(ch, tmp);
 7688   jccb(Assembler::equal, FOUND_SEQ_CHAR);
 7689   addptr(result, 2);
 7690   subl(cnt1, 1);
 7691   jccb(Assembler::zero, RET_NOT_FOUND);
 7692   jmp(SCAN_TO_CHAR_LOOP);
 7693 
 7694   bind(RET_NOT_FOUND);
 7695   movl(result, -1);
 7696   jmpb(DONE_LABEL);
 7697 
 7698   bind(FOUND_CHAR);
 7699   if (UseAVX >= 2) {
 7700     vpmovmskb(tmp, vec3);
 7701   } else {
 7702     pmovmskb(tmp, vec3);
 7703   }
 7704   bsfl(ch, tmp);
 7705   addl(result, ch);
 7706 
 7707   bind(FOUND_SEQ_CHAR);
 7708   subptr(result, str1);
 7709   shrl(result, 1);
 7710 
 7711   bind(DONE_LABEL);
 7712 } // string_indexof_char
 7713 
 7714 // helper function for string_compare
 7715 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
 7716                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
 7717                                         Address::ScaleFactor scale2, Register index, int ae) {
 7718   if (ae == StrIntrinsicNode::LL) {
 7719     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
 7720     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
 7721   } else if (ae == StrIntrinsicNode::UU) {
 7722     load_unsigned_short(elem1, Address(str1, index, scale, 0));
 7723     load_unsigned_short(elem2, Address(str2, index, scale, 0));
 7724   } else {
 7725     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
 7726     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
 7727   }
 7728 }
 7729 
 7730 // Compare strings, used for char[] and byte[].
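      // Conceptually the result follows String.compareTo semantics (a sketch,
      // ignoring the LL/UU/LU/UL encodings and the vectorized paths below):
      //   int min = cnt1 < cnt2 ? cnt1 : cnt2;
      //   for (int i = 0; i < min; i++) {
      //     if (str1[i] != str2[i]) return str1[i] - str2[i];
      //   }
      //   return cnt1 - cnt2;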
 7731 void MacroAssembler::string_compare(Register str1, Register str2,
 7732                                     Register cnt1, Register cnt2, Register result,
 7733                                     XMMRegister vec1, int ae) {
 7734   ShortBranchVerifier sbv(this);
 7735   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
 7736   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
 7737   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
 7738   int stride2x2 = 0x40;
 7739   Address::ScaleFactor scale = Address::no_scale;
 7740   Address::ScaleFactor scale1 = Address::no_scale;
 7741   Address::ScaleFactor scale2 = Address::no_scale;
 7742 
 7743   if (ae != StrIntrinsicNode::LL) {
 7744     stride2x2 = 0x20;
 7745   }
 7746 
 7747   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
 7748     shrl(cnt2, 1);
 7749   }
 7750   // Compute the minimum of the string lengths and the
 7751   // difference of the string lengths (stack).
 7752   // Do the conditional move stuff
 7753   movl(result, cnt1);
 7754   subl(cnt1, cnt2);
 7755   push(cnt1);
 7756   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
 7757 
 7758   // Is the minimum length zero?
 7759   testl(cnt2, cnt2);
 7760   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
 7761   if (ae == StrIntrinsicNode::LL) {
 7762     // Load first bytes
 7763     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
 7764     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
 7765   } else if (ae == StrIntrinsicNode::UU) {
 7766     // Load first characters
 7767     load_unsigned_short(result, Address(str1, 0));
 7768     load_unsigned_short(cnt1, Address(str2, 0));
 7769   } else {
 7770     load_unsigned_byte(result, Address(str1, 0));
 7771     load_unsigned_short(cnt1, Address(str2, 0));
 7772   }
 7773   subl(result, cnt1);
 7774   jcc(Assembler::notZero,  POP_LABEL);
 7775 
 7776   if (ae == StrIntrinsicNode::UU) {
 7777     // Divide length by 2 to get number of chars
 7778     shrl(cnt2, 1);
 7779   }
 7780   cmpl(cnt2, 1);
 7781   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
 7782 
 7783   // Check if the strings start at the same location and setup scale and stride
 7784   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7785     cmpptr(str1, str2);
 7786     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
 7787     if (ae == StrIntrinsicNode::LL) {
 7788       scale = Address::times_1;
 7789       stride = 16;
 7790     } else {
 7791       scale = Address::times_2;
 7792       stride = 8;
 7793     }
 7794   } else {
 7795     scale1 = Address::times_1;
 7796     scale2 = Address::times_2;
 7797     // scale not used
 7798     stride = 8;
 7799   }
 7800 
 7801   if (UseAVX >= 2 && UseSSE42Intrinsics) {
 7802     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
 7803     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
 7804     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
 7805     Label COMPARE_TAIL_LONG;
 7806     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
 7807 
 7808     int pcmpmask = 0x19;
 7809     if (ae == StrIntrinsicNode::LL) {
 7810       pcmpmask &= ~0x01;
 7811     }
 7812 
 7813     // Set up to compare 16-char (32-byte) vectors;
 7814     // start from the first character again because it has an aligned address.
 7815     if (ae == StrIntrinsicNode::LL) {
 7816       stride2 = 32;
 7817     } else {
 7818       stride2 = 16;
 7819     }
 7820     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7821       adr_stride = stride << scale;
 7822     } else {
 7823       adr_stride1 = 8;  //stride << scale1;
 7824       adr_stride2 = 16; //stride << scale2;
 7825     }
 7826 
 7827     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
 7828     // rax and rdx are used by pcmpestri as element counters
 7829     movl(result, cnt2);
 7830     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
 7831     jcc(Assembler::zero, COMPARE_TAIL_LONG);
 7832 
 7833     // fast path : compare first 2 8-char vectors.
 7834     bind(COMPARE_16_CHARS);
 7835     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7836       movdqu(vec1, Address(str1, 0));
 7837     } else {
 7838       pmovzxbw(vec1, Address(str1, 0));
 7839     }
 7840     pcmpestri(vec1, Address(str2, 0), pcmpmask);
 7841     jccb(Assembler::below, COMPARE_INDEX_CHAR);
 7842 
 7843     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7844       movdqu(vec1, Address(str1, adr_stride));
 7845       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
 7846     } else {
 7847       pmovzxbw(vec1, Address(str1, adr_stride1));
 7848       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
 7849     }
 7850     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
 7851     addl(cnt1, stride);
 7852 
 7853     // Compare the characters at index in cnt1
 7854     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
 7855     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
 7856     subl(result, cnt2);
 7857     jmp(POP_LABEL);
 7858 
 7859     // Setup the registers to start vector comparison loop
 7860     bind(COMPARE_WIDE_VECTORS);
 7861     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7862       lea(str1, Address(str1, result, scale));
 7863       lea(str2, Address(str2, result, scale));
 7864     } else {
 7865       lea(str1, Address(str1, result, scale1));
 7866       lea(str2, Address(str2, result, scale2));
 7867     }
 7868     subl(result, stride2);
 7869     subl(cnt2, stride2);
 7870     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
 7871     negptr(result);
 7872 
 7873     // In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
 7874     bind(COMPARE_WIDE_VECTORS_LOOP);
 7875 
 7876 #ifdef _LP64
 7877     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
 7878       cmpl(cnt2, stride2x2);
 7879       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
 7880       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
 7881       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
 7882 
 7883       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
 7884       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7885         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
 7886         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
 7887       } else {
 7888         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
 7889         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
 7890       }
 7891       kortestql(k7, k7);
 7892       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
 7893       addptr(result, stride2x2);  // update since we already compared at this addr
 7894       subl(cnt2, stride2x2);      // and sub the size too
 7895       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
 7896 
 7897       vpxor(vec1, vec1);
 7898       jmpb(COMPARE_WIDE_TAIL);
 7899     }//if (VM_Version::supports_avx512vlbw())
 7900 #endif // _LP64
 7901 
 7902 
 7903     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 7904     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7905       vmovdqu(vec1, Address(str1, result, scale));
 7906       vpxor(vec1, Address(str2, result, scale));
 7907     } else {
 7908       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
 7909       vpxor(vec1, Address(str2, result, scale2));
 7910     }
 7911     vptest(vec1, vec1);
 7912     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
 7913     addptr(result, stride2);
 7914     subl(cnt2, stride2);
 7915     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
 7916     // clean upper bits of YMM registers
 7917     vpxor(vec1, vec1);
 7918 
 7919     // compare wide vectors tail
 7920     bind(COMPARE_WIDE_TAIL);
 7921     testptr(result, result);
 7922     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
 7923 
 7924     movl(result, stride2);
 7925     movl(cnt2, result);
 7926     negptr(result);
 7927     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 7928 
 7929     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
 7930     bind(VECTOR_NOT_EQUAL);
 7931     // clean upper bits of YMM registers
 7932     vpxor(vec1, vec1);
 7933     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7934       lea(str1, Address(str1, result, scale));
 7935       lea(str2, Address(str2, result, scale));
 7936     } else {
 7937       lea(str1, Address(str1, result, scale1));
 7938       lea(str2, Address(str2, result, scale2));
 7939     }
 7940     jmp(COMPARE_16_CHARS);
 7941 
 7942     // Compare tail chars, length between 1 and 15 chars
 7943     bind(COMPARE_TAIL_LONG);
 7944     movl(cnt2, result);
 7945     cmpl(cnt2, stride);
 7946     jcc(Assembler::less, COMPARE_SMALL_STR);
 7947 
 7948     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7949       movdqu(vec1, Address(str1, 0));
 7950     } else {
 7951       pmovzxbw(vec1, Address(str1, 0));
 7952     }
 7953     pcmpestri(vec1, Address(str2, 0), pcmpmask);
 7954     jcc(Assembler::below, COMPARE_INDEX_CHAR);
 7955     subptr(cnt2, stride);
 7956     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
 7957     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7958       lea(str1, Address(str1, result, scale));
 7959       lea(str2, Address(str2, result, scale));
 7960     } else {
 7961       lea(str1, Address(str1, result, scale1));
 7962       lea(str2, Address(str2, result, scale2));
 7963     }
 7964     negptr(cnt2);
 7965     jmpb(WHILE_HEAD_LABEL);
 7966 
 7967     bind(COMPARE_SMALL_STR);
 7968   } else if (UseSSE42Intrinsics) {
 7969     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
 7970     int pcmpmask = 0x19;
 7971     // Set up to compare 8-char (16-byte) vectors;
 7972     // start from the first character again because it has an aligned address.
 7973     movl(result, cnt2);
 7974     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
 7975     if (ae == StrIntrinsicNode::LL) {
 7976       pcmpmask &= ~0x01;
 7977     }
 7978     jcc(Assembler::zero, COMPARE_TAIL);
 7979     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 7980       lea(str1, Address(str1, result, scale));
 7981       lea(str2, Address(str2, result, scale));
 7982     } else {
 7983       lea(str1, Address(str1, result, scale1));
 7984       lea(str2, Address(str2, result, scale2));
 7985     }
 7986     negptr(result);
 7987 
 7988     // pcmpestri
 7989     //   inputs:
 7990     //     vec1 - substring
 7991     //     rax - negative string length (elements count)
 7992     //     mem - scanned string
 7993     //     rdx - string length (elements count)
 7994     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
 7995     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
 7996     //   outputs:
 7997     //     rcx - first mismatched element index
 7998     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
 7999 
 8000     bind(COMPARE_WIDE_VECTORS);
 8001     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 8002       movdqu(vec1, Address(str1, result, scale));
 8003       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
 8004     } else {
 8005       pmovzxbw(vec1, Address(str1, result, scale1));
 8006       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
 8007     }
 8008     // After pcmpestri cnt1(rcx) contains mismatched element index
 8009 
 8010     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
 8011     addptr(result, stride);
 8012     subptr(cnt2, stride);
 8013     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
 8014 
 8015     // compare wide vectors tail
 8016     testptr(result, result);
 8017     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
 8018 
 8019     movl(cnt2, stride);
 8020     movl(result, stride);
 8021     negptr(result);
 8022     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 8023       movdqu(vec1, Address(str1, result, scale));
 8024       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
 8025     } else {
 8026       pmovzxbw(vec1, Address(str1, result, scale1));
 8027       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
 8028     }
 8029     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
 8030 
 8031     // Mismatched characters in the vectors
 8032     bind(VECTOR_NOT_EQUAL);
 8033     addptr(cnt1, result);
 8034     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
 8035     subl(result, cnt2);
 8036     jmpb(POP_LABEL);
 8037 
 8038     bind(COMPARE_TAIL); // limit is zero
 8039     movl(cnt2, result);
 8040     // Fallthru to tail compare
 8041   }
 8042   // Shift str2 and str1 to the end of the arrays, negate min
 8043   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
 8044     lea(str1, Address(str1, cnt2, scale));
 8045     lea(str2, Address(str2, cnt2, scale));
 8046   } else {
 8047     lea(str1, Address(str1, cnt2, scale1));
 8048     lea(str2, Address(str2, cnt2, scale2));
 8049   }
 8050   decrementl(cnt2);  // first character was compared already
 8051   negptr(cnt2);
 8052 
 8053   // Compare the rest of the elements
 8054   bind(WHILE_HEAD_LABEL);
 8055   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
 8056   subl(result, cnt1);
 8057   jccb(Assembler::notZero, POP_LABEL);
 8058   increment(cnt2);
 8059   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
 8060 
 8061   // Strings are equal up to min length.  Return the length difference.
 8062   bind(LENGTH_DIFF_LABEL);
 8063   pop(result);
 8064   if (ae == StrIntrinsicNode::UU) {
 8065     // Divide diff by 2 to get number of chars
 8066     sarl(result, 1);
 8067   }
 8068   jmpb(DONE_LABEL);
 8069 
 8070 #ifdef _LP64
 8071   if (VM_Version::supports_avx512vlbw()) {
 8072 
 8073     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
 8074 
 8075     kmovql(cnt1, k7);
 8076     notq(cnt1);
 8077     bsfq(cnt2, cnt1);
 8078     if (ae != StrIntrinsicNode::LL) {
 8079       // Divide diff by 2 to get number of chars
 8080       sarl(cnt2, 1);
 8081     }
 8082     addq(result, cnt2);
 8083     if (ae == StrIntrinsicNode::LL) {
 8084       load_unsigned_byte(cnt1, Address(str2, result));
 8085       load_unsigned_byte(result, Address(str1, result));
 8086     } else if (ae == StrIntrinsicNode::UU) {
 8087       load_unsigned_short(cnt1, Address(str2, result, scale));
 8088       load_unsigned_short(result, Address(str1, result, scale));
 8089     } else {
 8090       load_unsigned_short(cnt1, Address(str2, result, scale2));
 8091       load_unsigned_byte(result, Address(str1, result, scale1));
 8092     }
 8093     subl(result, cnt1);
 8094     jmpb(POP_LABEL);
 8095   }//if (VM_Version::supports_avx512vlbw())
 8096 #endif // _LP64
 8097 
 8098   // Discard the stored length difference
 8099   bind(POP_LABEL);
 8100   pop(cnt1);
 8101 
 8102   // That's it
 8103   bind(DONE_LABEL);
 8104   if(ae == StrIntrinsicNode::UL) {
 8105     negl(result);
 8106   }
 8107 
 8108 }
 8109 
 8110 // Search for a non-ASCII character (negative byte value) in a byte array;
 8111 // return true if it has any and false otherwise.
 8112 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
 8113 //   @HotSpotIntrinsicCandidate
 8114 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
 8115 //     for (int i = off; i < off + len; i++) {
 8116 //       if (ba[i] < 0) {
 8117 //         return true;
 8118 //       }
 8119 //     }
 8120 //     return false;
 8121 //   }
 8122 void MacroAssembler::has_negatives(Register ary1, Register len,
 8123   Register result, Register tmp1,
 8124   XMMRegister vec1, XMMRegister vec2) {
 8125   // rsi: byte array
 8126   // rcx: len
 8127   // rax: result
 8128   ShortBranchVerifier sbv(this);
 8129   assert_different_registers(ary1, len, result, tmp1);
 8130   assert_different_registers(vec1, vec2);
 8131   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
 8132 
 8133   // len == 0
 8134   testl(len, len);
 8135   jcc(Assembler::zero, FALSE_LABEL);
 8136 
 8137   if ((UseAVX > 2) && // AVX512
 8138     VM_Version::supports_avx512vlbw() &&
 8139     VM_Version::supports_bmi2()) {
 8140 
 8141     set_vector_masking();  // opening of the stub context for programming mask registers
 8142 
 8143     Label test_64_loop, test_tail;
 8144     Register tmp3_aliased = len;
 8145 
 8146     movl(tmp1, len);
 8147     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
 8148 
 8149     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
 8150     andl(len, ~(64 - 1));    // vector count (in chars)
 8151     jccb(Assembler::zero, test_tail);
 8152 
 8153     lea(ary1, Address(ary1, len, Address::times_1));
 8154     negptr(len);
 8155 
 8156     bind(test_64_loop);
 8157     // Check whether any of our 64 byte-sized elements is negative
 8158     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
 8159     kortestql(k2, k2);
 8160     jcc(Assembler::notZero, TRUE_LABEL);
 8161 
 8162     addptr(len, 64);
 8163     jccb(Assembler::notZero, test_64_loop);
 8164 
 8165 
 8166     bind(test_tail);
 8167     // bail out when there is nothing to be done
 8168     testl(tmp1, -1);
 8169     jcc(Assembler::zero, FALSE_LABEL);
 8170 
 8171     // Save k1
 8172     kmovql(k3, k1);
 8173 
 8174     // ~(~0 << len) applied up to two times (for 32-bit scenario)
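          // Example (a sketch): for a tail count of 3 in tmp1, ~(~0 << 3) = 0b111,
          // i.e. a mask with the three least significant bits set, so the masked
          // compare below only looks at the three remaining tail bytes.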
 8175 #ifdef _LP64
 8176     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
 8177     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
 8178     notq(tmp3_aliased);
 8179     kmovql(k1, tmp3_aliased);
 8180 #else
 8181     Label k_init;
 8182     jmp(k_init);
 8183 
 8184     // On the 32-bit VM we cannot read 64 bits from a general purpose register,
 8185     // so we place the data required to compose 64 1's into the instruction stream.
 8186     // We emit a 64-byte wide series of the values 0..63 which is later used as the
 8187     // compare target against the tail count contained in the tmp1 register.
 8188     // The result is a k1 register with tmp1 consecutive 1's counting from the
 8189     // least significant bit.
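          // Example (a sketch): with tmp1 = 5, vec1 becomes {5, 5, ..., 5} and the
          // greater-than compare against the table {0, 1, ..., 63} sets exactly
          // bits 0..4 of k1, i.e. a 5-bit tail mask.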
 8190     address tmp = pc();
 8191     emit_int64(0x0706050403020100);
 8192     emit_int64(0x0F0E0D0C0B0A0908);
 8193     emit_int64(0x1716151413121110);
 8194     emit_int64(0x1F1E1D1C1B1A1918);
 8195     emit_int64(0x2726252423222120);
 8196     emit_int64(0x2F2E2D2C2B2A2928);
 8197     emit_int64(0x3736353433323130);
 8198     emit_int64(0x3F3E3D3C3B3A3938);
 8199 
 8200     bind(k_init);
 8201     lea(len, InternalAddress(tmp));
 8202     // create the k1 tail mask: bit i of k1 is set iff i < tail count in tmp1
 8203     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
 8204     evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
 8205 
 8206 #endif
 8207     evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
 8208     ktestq(k2, k1);
 8209     // Restore k1
 8210     kmovql(k1, k3);
 8211     jcc(Assembler::notZero, TRUE_LABEL);
 8212 
 8213     jmp(FALSE_LABEL);
 8214 
 8215     clear_vector_masking();   // closing of the stub context for programming mask registers
 8216   } else {
 8217     movl(result, len); // copy
 8218 
 8219     if (UseAVX == 2 && UseSSE >= 2) {
 8220       // With AVX2, use 32-byte vector compare
 8221       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
 8222 
 8223       // Compare 32-byte vectors
 8224       andl(result, 0x0000001f);  //   tail count (in bytes)
 8225       andl(len, 0xffffffe0);   // vector count (in bytes)
 8226       jccb(Assembler::zero, COMPARE_TAIL);
 8227 
 8228       lea(ary1, Address(ary1, len, Address::times_1));
 8229       negptr(len);
 8230 
 8231       movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
 8232       movdl(vec2, tmp1);
 8233       vpbroadcastd(vec2, vec2);
 8234 
 8235       bind(COMPARE_WIDE_VECTORS);
 8236       vmovdqu(vec1, Address(ary1, len, Address::times_1));
 8237       vptest(vec1, vec2);
 8238       jccb(Assembler::notZero, TRUE_LABEL);
 8239       addptr(len, 32);
 8240       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
 8241 
 8242       testl(result, result);
 8243       jccb(Assembler::zero, FALSE_LABEL);
 8244 
 8245       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
 8246       vptest(vec1, vec2);
 8247       jccb(Assembler::notZero, TRUE_LABEL);
 8248       jmpb(FALSE_LABEL);
 8249 
 8250       bind(COMPARE_TAIL); // len is zero
 8251       movl(len, result);
 8252       // Fallthru to tail compare
 8253     } else if (UseSSE42Intrinsics) {
 8254       // With SSE4.2, use double quad vector compare
 8255       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
 8256 
 8257       // Compare 16-byte vectors
 8258       andl(result, 0x0000000f);  //   tail count (in bytes)
 8259       andl(len, 0xfffffff0);   // vector count (in bytes)
 8260       jccb(Assembler::zero, COMPARE_TAIL);
 8261 
 8262       lea(ary1, Address(ary1, len, Address::times_1));
 8263       negptr(len);
 8264 
 8265       movl(tmp1, 0x80808080);
 8266       movdl(vec2, tmp1);
 8267       pshufd(vec2, vec2, 0);
 8268 
 8269       bind(COMPARE_WIDE_VECTORS);
 8270       movdqu(vec1, Address(ary1, len, Address::times_1));
 8271       ptest(vec1, vec2);
 8272       jccb(Assembler::notZero, TRUE_LABEL);
 8273       addptr(len, 16);
 8274       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
 8275 
 8276       testl(result, result);
 8277       jccb(Assembler::zero, FALSE_LABEL);
 8278 
 8279       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
 8280       ptest(vec1, vec2);
 8281       jccb(Assembler::notZero, TRUE_LABEL);
 8282       jmpb(FALSE_LABEL);
 8283 
 8284       bind(COMPARE_TAIL); // len is zero
 8285       movl(len, result);
 8286       // Fallthru to tail compare
 8287     }
 8288   }
 8289   // Compare 4-byte vectors
 8290   andl(len, 0xfffffffc); // vector count (in bytes)
 8291   jccb(Assembler::zero, COMPARE_CHAR);
 8292 
 8293   lea(ary1, Address(ary1, len, Address::times_1));
 8294   negptr(len);
 8295 
 8296   bind(COMPARE_VECTORS);
 8297   movl(tmp1, Address(ary1, len, Address::times_1));
 8298   andl(tmp1, 0x80808080);
 8299   jccb(Assembler::notZero, TRUE_LABEL);
 8300   addptr(len, 4);
 8301   jcc(Assembler::notZero, COMPARE_VECTORS);
 8302 
 8303   // Compare trailing char (final 2 bytes), if any
 8304   bind(COMPARE_CHAR);
 8305   testl(result, 0x2);   // tail  char
 8306   jccb(Assembler::zero, COMPARE_BYTE);
 8307   load_unsigned_short(tmp1, Address(ary1, 0));
 8308   andl(tmp1, 0x00008080);
 8309   jccb(Assembler::notZero, TRUE_LABEL);
 8310   subptr(result, 2);
 8311   lea(ary1, Address(ary1, 2));
 8312 
 8313   bind(COMPARE_BYTE);
 8314   testl(result, 0x1);   // tail  byte
 8315   jccb(Assembler::zero, FALSE_LABEL);
 8316   load_unsigned_byte(tmp1, Address(ary1, 0));
 8317   andl(tmp1, 0x00000080);
 8318   jccb(Assembler::notEqual, TRUE_LABEL);
 8319   jmpb(FALSE_LABEL);
 8320 
 8321   bind(TRUE_LABEL);
 8322   movl(result, 1);   // return true
 8323   jmpb(DONE);
 8324 
 8325   bind(FALSE_LABEL);
 8326   xorl(result, result); // return false
 8327 
 8328   // That's it
 8329   bind(DONE);
 8330   if (UseAVX >= 2 && UseSSE >= 2) {
 8331     // clean upper bits of YMM registers
 8332     vpxor(vec1, vec1);
 8333     vpxor(vec2, vec2);
 8334   }
 8335 }
 8336 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
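      // Conceptually (a sketch; the code below uses 16/32/64-byte vector compares):
      //   if (is_array_equ) {
      //     if (ary1 == ary2) return true;
      //     if (ary1 == NULL || ary2 == NULL) return false;
      //     if (length(ary1) != length(ary2)) return false;   // length() is illustrative
      //   }
      //   return memcmp(body1, body2, byte_count) == 0;       // array bodies or substrings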
 8337 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
 8338                                    Register limit, Register result, Register chr,
 8339                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
 8340   ShortBranchVerifier sbv(this);
 8341   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
 8342 
 8343   int length_offset  = arrayOopDesc::length_offset_in_bytes();
 8344   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
 8345 
 8346   if (is_array_equ) {
 8347     // Check the input args
 8348     cmpoop(ary1, ary2);
 8349     jcc(Assembler::equal, TRUE_LABEL);
 8350 
 8351     // Need additional checks for arrays_equals.
 8352     testptr(ary1, ary1);
 8353     jcc(Assembler::zero, FALSE_LABEL);
 8354     testptr(ary2, ary2);
 8355     jcc(Assembler::zero, FALSE_LABEL);
 8356 
 8357     // Check the lengths
 8358     movl(limit, Address(ary1, length_offset));
 8359     cmpl(limit, Address(ary2, length_offset));
 8360     jcc(Assembler::notEqual, FALSE_LABEL);
 8361   }
 8362 
 8363   // count == 0
 8364   testl(limit, limit);
 8365   jcc(Assembler::zero, TRUE_LABEL);
 8366 
 8367   if (is_array_equ) {
 8368     // Load array address
 8369     lea(ary1, Address(ary1, base_offset));
 8370     lea(ary2, Address(ary2, base_offset));
 8371   }
 8372 
 8373   if (is_array_equ && is_char) {
 8374     // arrays_equals when used for char[].
 8375     shll(limit, 1);      // byte count != 0
 8376   }
 8377   movl(result, limit); // copy
 8378 
 8379   if (UseAVX >= 2) {
 8380     // With AVX2, use 32-byte vector compare
 8381     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
 8382 
 8383     // Compare 32-byte vectors
 8384     andl(result, 0x0000001f);  //   tail count (in bytes)
 8385     andl(limit, 0xffffffe0);   // vector count (in bytes)
 8386     jcc(Assembler::zero, COMPARE_TAIL);
 8387 
 8388     lea(ary1, Address(ary1, limit, Address::times_1));
 8389     lea(ary2, Address(ary2, limit, Address::times_1));
 8390     negptr(limit);
 8391 
 8392     bind(COMPARE_WIDE_VECTORS);
 8393 
 8394 #ifdef _LP64
 8395     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
 8396       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
 8397 
 8398       cmpl(limit, -64);
 8399       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
 8400 
 8401       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
 8402 
 8403       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
 8404       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
 8405       kortestql(k7, k7);
 8406       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
 8407       addptr(limit, 64);  // update since we already compared at this addr
 8408       cmpl(limit, -64);
 8409       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
 8410 
 8411       // At this point we may still need to compare -limit+result bytes.
 8412       // We could execute the next two instructions and just continue via the non-wide path:
 8413       //  cmpl(limit, 0);
 8414       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
 8415       // But since we stopped at the points ary{1,2}+limit which are
 8416       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
 8417       // (|limit| <= 32 and result < 32),
 8418       // we may just compare the last 64 bytes.
 8419       //
 8420       addptr(result, -64);   // it is safe, because we just came from this area
 8421       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
 8422       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
 8423       kortestql(k7, k7);
 8424       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
 8425 
 8426       jmp(TRUE_LABEL);
 8427 
 8428       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
 8429 
 8430     }//if (VM_Version::supports_avx512vlbw())
 8431 #endif //_LP64
 8432 
 8433     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
 8434     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
 8435     vpxor(vec1, vec2);
 8436 
 8437     vptest(vec1, vec1);
 8438     jcc(Assembler::notZero, FALSE_LABEL);
 8439     addptr(limit, 32);
 8440     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
 8441 
 8442     testl(result, result);
 8443     jcc(Assembler::zero, TRUE_LABEL);
 8444 
 8445     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
 8446     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
 8447     vpxor(vec1, vec2);
 8448 
 8449     vptest(vec1, vec1);
 8450     jccb(Assembler::notZero, FALSE_LABEL);
 8451     jmpb(TRUE_LABEL);
 8452 
 8453     bind(COMPARE_TAIL); // limit is zero
 8454     movl(limit, result);
 8455     // Fallthru to tail compare
 8456   } else if (UseSSE42Intrinsics) {
 8457     // With SSE4.2, use double quad vector compare
 8458     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
 8459 
 8460     // Compare 16-byte vectors
 8461     andl(result, 0x0000000f);  //   tail count (in bytes)
 8462     andl(limit, 0xfffffff0);   // vector count (in bytes)
 8463     jcc(Assembler::zero, COMPARE_TAIL);
 8464 
 8465     lea(ary1, Address(ary1, limit, Address::times_1));
 8466     lea(ary2, Address(ary2, limit, Address::times_1));
 8467     negptr(limit);
 8468 
 8469     bind(COMPARE_WIDE_VECTORS);
 8470     movdqu(vec1, Address(ary1, limit, Address::times_1));
 8471     movdqu(vec2, Address(ary2, limit, Address::times_1));
 8472     pxor(vec1, vec2);
 8473 
 8474     ptest(vec1, vec1);
 8475     jcc(Assembler::notZero, FALSE_LABEL);
 8476     addptr(limit, 16);
 8477     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
 8478 
 8479     testl(result, result);
 8480     jcc(Assembler::zero, TRUE_LABEL);
 8481 
 8482     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
 8483     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
 8484     pxor(vec1, vec2);
 8485 
 8486     ptest(vec1, vec1);
 8487     jccb(Assembler::notZero, FALSE_LABEL);
 8488     jmpb(TRUE_LABEL);
 8489 
 8490     bind(COMPARE_TAIL); // limit is zero
 8491     movl(limit, result);
 8492     // Fallthru to tail compare
 8493   }
 8494 
 8495   // Compare 4-byte vectors
 8496   andl(limit, 0xfffffffc); // vector count (in bytes)
 8497   jccb(Assembler::zero, COMPARE_CHAR);
 8498 
 8499   lea(ary1, Address(ary1, limit, Address::times_1));
 8500   lea(ary2, Address(ary2, limit, Address::times_1));
 8501   negptr(limit);
 8502 
 8503   bind(COMPARE_VECTORS);
 8504   movl(chr, Address(ary1, limit, Address::times_1));
 8505   cmpl(chr, Address(ary2, limit, Address::times_1));
 8506   jccb(Assembler::notEqual, FALSE_LABEL);
 8507   addptr(limit, 4);
 8508   jcc(Assembler::notZero, COMPARE_VECTORS);
 8509 
 8510   // Compare trailing char (final 2 bytes), if any
 8511   bind(COMPARE_CHAR);
 8512   testl(result, 0x2);   // tail  char
 8513   jccb(Assembler::zero, COMPARE_BYTE);
 8514   load_unsigned_short(chr, Address(ary1, 0));
 8515   load_unsigned_short(limit, Address(ary2, 0));
 8516   cmpl(chr, limit);
 8517   jccb(Assembler::notEqual, FALSE_LABEL);
 8518 
 8519   if (is_array_equ && is_char) {
 8520     bind(COMPARE_BYTE);
 8521   } else {
 8522     lea(ary1, Address(ary1, 2));
 8523     lea(ary2, Address(ary2, 2));
 8524 
 8525     bind(COMPARE_BYTE);
 8526     testl(result, 0x1);   // tail  byte
 8527     jccb(Assembler::zero, TRUE_LABEL);
 8528     load_unsigned_byte(chr, Address(ary1, 0));
 8529     load_unsigned_byte(limit, Address(ary2, 0));
 8530     cmpl(chr, limit);
 8531     jccb(Assembler::notEqual, FALSE_LABEL);
 8532   }
 8533   bind(TRUE_LABEL);
 8534   movl(result, 1);   // return true
 8535   jmpb(DONE);
 8536 
 8537   bind(FALSE_LABEL);
 8538   xorl(result, result); // return false
 8539 
 8540   // That's it
 8541   bind(DONE);
 8542   if (UseAVX >= 2) {
 8543     // clean upper bits of YMM registers
 8544     vpxor(vec1, vec1);
 8545     vpxor(vec2, vec2);
 8546   }
 8547 }
 8548 
 8549 #endif
 8550 
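// Fill a primitive array with 'value'.  A rough sketch of the approach used
// below (illustrative only): the byte or short value is first replicated into
// a full 32-bit pattern, e.g.
//   T_BYTE:  0x000000AB -> 0xABABABAB
//   T_SHORT: 0x00001234 -> 0x12341234
// and that pattern is then stored in the widest chunks the CPU supports
// (64/32/16-byte vectors when available, then 8/4/2/1-byte stores for the
// unaligned head and the tail).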
 8551 void MacroAssembler::generate_fill(BasicType t, bool aligned,
 8552                                    Register to, Register value, Register count,
 8553                                    Register rtmp, XMMRegister xtmp) {
 8554   ShortBranchVerifier sbv(this);
 8555   assert_different_registers(to, value, count, rtmp);
 8556   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 8557   Label L_fill_2_bytes, L_fill_4_bytes;
 8558 
 8559   int shift = -1;
 8560   switch (t) {
 8561     case T_BYTE:
 8562       shift = 2;
 8563       break;
 8564     case T_SHORT:
 8565       shift = 1;
 8566       break;
 8567     case T_INT:
 8568       shift = 0;
 8569       break;
 8570     default: ShouldNotReachHere();
 8571   }
 8572 
 8573   if (t == T_BYTE) {
 8574     andl(value, 0xff);
 8575     movl(rtmp, value);
 8576     shll(rtmp, 8);
 8577     orl(value, rtmp);
 8578   }
 8579   if (t == T_SHORT) {
 8580     andl(value, 0xffff);
 8581   }
 8582   if (t == T_BYTE || t == T_SHORT) {
 8583     movl(rtmp, value);
 8584     shll(rtmp, 16);
 8585     orl(value, rtmp);
 8586   }
 8587 
 8588   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
 8589   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
 8590   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
 8591     // align destination address to a 4-byte boundary
 8592     if (t == T_BYTE) {
 8593       // One byte misalignment happens only for byte arrays
 8594       testptr(to, 1);
 8595       jccb(Assembler::zero, L_skip_align1);
 8596       movb(Address(to, 0), value);
 8597       increment(to);
 8598       decrement(count);
 8599       BIND(L_skip_align1);
 8600     }
 8601     // Two bytes misalignment happens only for byte and short (char) arrays
 8602     testptr(to, 2);
 8603     jccb(Assembler::zero, L_skip_align2);
 8604     movw(Address(to, 0), value);
 8605     addptr(to, 2);
 8606     subl(count, 1<<(shift-1));
 8607     BIND(L_skip_align2);
 8608   }
 8609   if (UseSSE < 2) {
 8610     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
 8611     // Fill 32-byte chunks
 8612     subl(count, 8 << shift);
 8613     jcc(Assembler::less, L_check_fill_8_bytes);
 8614     align(16);
 8615 
 8616     BIND(L_fill_32_bytes_loop);
 8617 
 8618     for (int i = 0; i < 32; i += 4) {
 8619       movl(Address(to, i), value);
 8620     }
 8621 
 8622     addptr(to, 32);
 8623     subl(count, 8 << shift);
 8624     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
 8625     BIND(L_check_fill_8_bytes);
 8626     addl(count, 8 << shift);
 8627     jccb(Assembler::zero, L_exit);
 8628     jmpb(L_fill_8_bytes);
 8629 
 8630     //
 8631     // length is too short, just fill qwords
 8632     //
 8633     BIND(L_fill_8_bytes_loop);
 8634     movl(Address(to, 0), value);
 8635     movl(Address(to, 4), value);
 8636     addptr(to, 8);
 8637     BIND(L_fill_8_bytes);
 8638     subl(count, 1 << (shift + 1));
 8639     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
 8640     // fall through to fill 4 bytes
 8641   } else {
 8642     Label L_fill_32_bytes;
 8643     if (!UseUnalignedLoadStores) {
 8644       // align to 8 bytes, we know we are 4 byte aligned to start
 8645       testptr(to, 4);
 8646       jccb(Assembler::zero, L_fill_32_bytes);
 8647       movl(Address(to, 0), value);
 8648       addptr(to, 4);
 8649       subl(count, 1<<shift);
 8650     }
 8651     BIND(L_fill_32_bytes);
 8652     {
 8653       assert( UseSSE >= 2, "supported cpu only" );
 8654       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
 8655       if (UseAVX > 2) {
 8656         movl(rtmp, 0xffff);
 8657         kmovwl(k1, rtmp);
 8658       }
 8659       movdl(xtmp, value);
 8660       if (UseAVX > 2 && UseUnalignedLoadStores) {
 8661         // Fill 64-byte chunks
 8662         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
 8663         evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
 8664 
 8665         subl(count, 16 << shift);
 8666         jcc(Assembler::less, L_check_fill_32_bytes);
 8667         align(16);
 8668 
 8669         BIND(L_fill_64_bytes_loop);
 8670         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
 8671         addptr(to, 64);
 8672         subl(count, 16 << shift);
 8673         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
 8674 
 8675         BIND(L_check_fill_32_bytes);
 8676         addl(count, 8 << shift);
 8677         jccb(Assembler::less, L_check_fill_8_bytes);
 8678         vmovdqu(Address(to, 0), xtmp);
 8679         addptr(to, 32);
 8680         subl(count, 8 << shift);
 8681 
 8682         BIND(L_check_fill_8_bytes);
 8683       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
 8684         // Fill 64-byte chunks
 8685         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
 8686         vpbroadcastd(xtmp, xtmp);
 8687 
 8688         subl(count, 16 << shift);
 8689         jcc(Assembler::less, L_check_fill_32_bytes);
 8690         align(16);
 8691 
 8692         BIND(L_fill_64_bytes_loop);
 8693         vmovdqu(Address(to, 0), xtmp);
 8694         vmovdqu(Address(to, 32), xtmp);
 8695         addptr(to, 64);
 8696         subl(count, 16 << shift);
 8697         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
 8698 
 8699         BIND(L_check_fill_32_bytes);
 8700         addl(count, 8 << shift);
 8701         jccb(Assembler::less, L_check_fill_8_bytes);
 8702         vmovdqu(Address(to, 0), xtmp);
 8703         addptr(to, 32);
 8704         subl(count, 8 << shift);
 8705 
 8706         BIND(L_check_fill_8_bytes);
 8707         // clean upper bits of YMM registers
 8708         movdl(xtmp, value);
 8709         pshufd(xtmp, xtmp, 0);
 8710       } else {
 8711         // Fill 32-byte chunks
 8712         pshufd(xtmp, xtmp, 0);
 8713 
 8714         subl(count, 8 << shift);
 8715         jcc(Assembler::less, L_check_fill_8_bytes);
 8716         align(16);
 8717 
 8718         BIND(L_fill_32_bytes_loop);
 8719 
 8720         if (UseUnalignedLoadStores) {
 8721           movdqu(Address(to, 0), xtmp);
 8722           movdqu(Address(to, 16), xtmp);
 8723         } else {
 8724           movq(Address(to, 0), xtmp);
 8725           movq(Address(to, 8), xtmp);
 8726           movq(Address(to, 16), xtmp);
 8727           movq(Address(to, 24), xtmp);
 8728         }
 8729 
 8730         addptr(to, 32);
 8731         subl(count, 8 << shift);
 8732         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
 8733 
 8734         BIND(L_check_fill_8_bytes);
 8735       }
 8736       addl(count, 8 << shift);
 8737       jccb(Assembler::zero, L_exit);
 8738       jmpb(L_fill_8_bytes);
 8739 
 8740       //
 8741       // length is too short, just fill qwords
 8742       //
 8743       BIND(L_fill_8_bytes_loop);
 8744       movq(Address(to, 0), xtmp);
 8745       addptr(to, 8);
 8746       BIND(L_fill_8_bytes);
 8747       subl(count, 1 << (shift + 1));
 8748       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
 8749     }
 8750   }
 8751   // fill trailing 4 bytes
 8752   BIND(L_fill_4_bytes);
 8753   testl(count, 1<<shift);
 8754   jccb(Assembler::zero, L_fill_2_bytes);
 8755   movl(Address(to, 0), value);
 8756   if (t == T_BYTE || t == T_SHORT) {
 8757     addptr(to, 4);
 8758     BIND(L_fill_2_bytes);
 8759     // fill trailing 2 bytes
 8760     testl(count, 1<<(shift-1));
 8761     jccb(Assembler::zero, L_fill_byte);
 8762     movw(Address(to, 0), value);
 8763     if (t == T_BYTE) {
 8764       addptr(to, 2);
 8765       BIND(L_fill_byte);
 8766       // fill trailing byte
 8767       testl(count, 1);
 8768       jccb(Assembler::zero, L_exit);
 8769       movb(Address(to, 0), value);
 8770     } else {
 8771       BIND(L_fill_byte);
 8772     }
 8773   } else {
 8774     BIND(L_fill_2_bytes);
 8775   }
 8776   BIND(L_exit);
 8777 }
 8778 
 8779 // encode char[] to byte[] in ISO_8859_1
 8780 //   @HotSpotIntrinsicCandidate
 8781 //   private static int implEncodeISOArray(byte[] sa, int sp,
 8782 //                                         byte[] da, int dp, int len) {
 8783 //     int i = 0;
 8784 //     for (; i < len; i++) {
 8785 //       char c = StringUTF16.getChar(sa, sp++);
 8786 //       if (c > '\u00FF')
 8787 //         break;
 8788 //       da[dp++] = (byte)c;
 8789 //     }
 8790 //     return i;
 8791 //   }
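//
// Vector strategy used below (a sketch of the generated code): a vector filled
// with the mask 0xff00ff00 is tested (ptest/vptest) against each chunk of
// packed 16-bit chars; a non-zero intersection means some char in the chunk is
// greater than 0xFF, so the code falls back to copying one char at a time and
// returns the number of chars actually encoded.  Otherwise the chars are
// narrowed to bytes with (v)packuswb and stored to the destination.
//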
 8792 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
 8793   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
 8794   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
 8795   Register tmp5, Register result) {
 8796 
 8797   // rsi: src
 8798   // rdi: dst
 8799   // rdx: len
 8800   // rcx: tmp5
 8801   // rax: result
 8802   ShortBranchVerifier sbv(this);
 8803   assert_different_registers(src, dst, len, tmp5, result);
 8804   Label L_done, L_copy_1_char, L_copy_1_char_exit;
 8805 
 8806   // set result
 8807   xorl(result, result);
 8808   // check for zero length
 8809   testl(len, len);
 8810   jcc(Assembler::zero, L_done);
 8811 
 8812   movl(result, len);
 8813 
 8814   // Setup pointers
 8815   lea(src, Address(src, len, Address::times_2)); // char[]
 8816   lea(dst, Address(dst, len, Address::times_1)); // byte[]
 8817   negptr(len);
 8818 
 8819   if (UseSSE42Intrinsics || UseAVX >= 2) {
 8820     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
 8821     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
 8822 
 8823     if (UseAVX >= 2) {
 8824       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
 8825       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
 8826       movdl(tmp1Reg, tmp5);
 8827       vpbroadcastd(tmp1Reg, tmp1Reg);
 8828       jmp(L_chars_32_check);
 8829 
 8830       bind(L_copy_32_chars);
 8831       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
 8832       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
 8833       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
 8834       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
 8835       jccb(Assembler::notZero, L_copy_32_chars_exit);
 8836       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
 8837       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
 8838       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
 8839 
 8840       bind(L_chars_32_check);
 8841       addptr(len, 32);
 8842       jcc(Assembler::lessEqual, L_copy_32_chars);
 8843 
 8844       bind(L_copy_32_chars_exit);
 8845       subptr(len, 16);
 8846       jccb(Assembler::greater, L_copy_16_chars_exit);
 8847 
 8848     } else if (UseSSE42Intrinsics) {
 8849       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
 8850       movdl(tmp1Reg, tmp5);
 8851       pshufd(tmp1Reg, tmp1Reg, 0);
 8852       jmpb(L_chars_16_check);
 8853     }
 8854 
 8855     bind(L_copy_16_chars);
 8856     if (UseAVX >= 2) {
 8857       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
 8858       vptest(tmp2Reg, tmp1Reg);
 8859       jcc(Assembler::notZero, L_copy_16_chars_exit);
 8860       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
 8861       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
 8862     } else {
 8863       if (UseAVX > 0) {
 8864         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
 8865         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
 8866         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
 8867       } else {
 8868         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
 8869         por(tmp2Reg, tmp3Reg);
 8870         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
 8871         por(tmp2Reg, tmp4Reg);
 8872       }
 8873       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
 8874       jccb(Assembler::notZero, L_copy_16_chars_exit);
 8875       packuswb(tmp3Reg, tmp4Reg);
 8876     }
 8877     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
 8878 
 8879     bind(L_chars_16_check);
 8880     addptr(len, 16);
 8881     jcc(Assembler::lessEqual, L_copy_16_chars);
 8882 
 8883     bind(L_copy_16_chars_exit);
 8884     if (UseAVX >= 2) {
 8885       // clean upper bits of YMM registers
 8886       vpxor(tmp2Reg, tmp2Reg);
 8887       vpxor(tmp3Reg, tmp3Reg);
 8888       vpxor(tmp4Reg, tmp4Reg);
 8889       movdl(tmp1Reg, tmp5);
 8890       pshufd(tmp1Reg, tmp1Reg, 0);
 8891     }
 8892     subptr(len, 8);
 8893     jccb(Assembler::greater, L_copy_8_chars_exit);
 8894 
 8895     bind(L_copy_8_chars);
 8896     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
 8897     ptest(tmp3Reg, tmp1Reg);
 8898     jccb(Assembler::notZero, L_copy_8_chars_exit);
 8899     packuswb(tmp3Reg, tmp1Reg);
 8900     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
 8901     addptr(len, 8);
 8902     jccb(Assembler::lessEqual, L_copy_8_chars);
 8903 
 8904     bind(L_copy_8_chars_exit);
 8905     subptr(len, 8);
 8906     jccb(Assembler::zero, L_done);
 8907   }
 8908 
 8909   bind(L_copy_1_char);
 8910   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
 8911   testl(tmp5, 0xff00);      // check if Unicode char
 8912   jccb(Assembler::notZero, L_copy_1_char_exit);
 8913   movb(Address(dst, len, Address::times_1, 0), tmp5);
 8914   addptr(len, 1);
 8915   jccb(Assembler::less, L_copy_1_char);
 8916 
 8917   bind(L_copy_1_char_exit);
 8918   addptr(result, len); // len is negative count of not processed elements
 8919 
 8920   bind(L_done);
 8921 }
 8922 
 8923 #ifdef _LP64
 8924 /**
 8925  * Helper for multiply_to_len().
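 *
 * Computes the 128-bit sum  dest_hi:dest_lo += src1 + src2:
 * both 64-bit addends are added into dest_lo, and each carry-out is
 * accumulated into dest_hi.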
 8926  */
 8927 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
 8928   addq(dest_lo, src1);
 8929   adcq(dest_hi, 0);
 8930   addq(dest_lo, src2);
 8931   adcq(dest_hi, 0);
 8932 }
 8933 
 8934 /**
 8935  * Multiply 64 bit by 64 bit first loop.
 8936  */
 8937 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
 8938                                            Register y, Register y_idx, Register z,
 8939                                            Register carry, Register product,
 8940                                            Register idx, Register kdx) {
 8941   //
 8942   //  jlong carry, x[], y[], z[];
 8943   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
 8944   //    huge_128 product = y[idx] * x[xstart] + carry;
 8945   //    z[kdx] = (jlong)product;
 8946   //    carry  = (jlong)(product >>> 64);
 8947   //  }
 8948   //  z[xstart] = carry;
 8949   //
 8950 
 8951   Label L_first_loop, L_first_loop_exit;
 8952   Label L_one_x, L_one_y, L_multiply;
 8953 
 8954   decrementl(xstart);
 8955   jcc(Assembler::negative, L_one_x);
 8956 
 8957   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
 8958   rorq(x_xstart, 32); // convert big-endian to little-endian
 8959 
 8960   bind(L_first_loop);
 8961   decrementl(idx);
 8962   jcc(Assembler::negative, L_first_loop_exit);
 8963   decrementl(idx);
 8964   jcc(Assembler::negative, L_one_y);
 8965   movq(y_idx, Address(y, idx, Address::times_4,  0));
 8966   rorq(y_idx, 32); // convert big-endian to little-endian
 8967   bind(L_multiply);
 8968   movq(product, x_xstart);
 8969   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
 8970   addq(product, carry);
 8971   adcq(rdx, 0);
 8972   subl(kdx, 2);
 8973   movl(Address(z, kdx, Address::times_4,  4), product);
 8974   shrq(product, 32);
 8975   movl(Address(z, kdx, Address::times_4,  0), product);
 8976   movq(carry, rdx);
 8977   jmp(L_first_loop);
 8978 
 8979   bind(L_one_y);
 8980   movl(y_idx, Address(y,  0));
 8981   jmp(L_multiply);
 8982 
 8983   bind(L_one_x);
 8984   movl(x_xstart, Address(x,  0));
 8985   jmp(L_first_loop);
 8986 
 8987   bind(L_first_loop_exit);
 8988 }
 8989 
 8990 /**
 8991  * Multiply 64 bit by 64 bit and add 128 bit.
 8992  */
 8993 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
 8994                                             Register yz_idx, Register idx,
 8995                                             Register carry, Register product, int offset) {
 8996   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
 8997   //     z[kdx] = (jlong)product;
 8998 
 8999   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
 9000   rorq(yz_idx, 32); // convert big-endian to little-endian
 9001   movq(product, x_xstart);
 9002   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
 9003   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
 9004   rorq(yz_idx, 32); // convert big-endian to little-endian
 9005 
 9006   add2_with_carry(rdx, product, carry, yz_idx);
 9007 
 9008   movl(Address(z, idx, Address::times_4,  offset+4), product);
 9009   shrq(product, 32);
 9010   movl(Address(z, idx, Address::times_4,  offset), product);
 9011 
 9012 }
 9013 
 9014 /**
 9015  * Multiply 128 bit by 128 bit. Unrolled inner loop.
 9016  */
 9017 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
 9018                                              Register yz_idx, Register idx, Register jdx,
 9019                                              Register carry, Register product,
 9020                                              Register carry2) {
 9021   //   jlong carry, x[], y[], z[];
 9022   //   int kdx = ystart+1;
 9023   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
 9024   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
 9025   //     z[kdx+idx+1] = (jlong)product;
 9026   //     jlong carry2  = (jlong)(product >>> 64);
 9027   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
 9028   //     z[kdx+idx] = (jlong)product;
 9029   //     carry  = (jlong)(product >>> 64);
 9030   //   }
 9031   //   idx += 2;
 9032   //   if (idx > 0) {
 9033   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
 9034   //     z[kdx+idx] = (jlong)product;
 9035   //     carry  = (jlong)(product >>> 64);
 9036   //   }
 9037   //
 9038 
 9039   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
 9040 
 9041   movl(jdx, idx);
 9042   andl(jdx, 0xFFFFFFFC);
 9043   shrl(jdx, 2);
 9044 
 9045   bind(L_third_loop);
 9046   subl(jdx, 1);
 9047   jcc(Assembler::negative, L_third_loop_exit);
 9048   subl(idx, 4);
 9049 
 9050   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
 9051   movq(carry2, rdx);
 9052 
 9053   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
 9054   movq(carry, rdx);
 9055   jmp(L_third_loop);
 9056 
 9057   bind (L_third_loop_exit);
 9058 
 9059   andl (idx, 0x3);
 9060   jcc(Assembler::zero, L_post_third_loop_done);
 9061 
 9062   Label L_check_1;
 9063   subl(idx, 2);
 9064   jcc(Assembler::negative, L_check_1);
 9065 
 9066   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
 9067   movq(carry, rdx);
 9068 
 9069   bind (L_check_1);
 9070   addl (idx, 0x2);
 9071   andl (idx, 0x1);
 9072   subl(idx, 1);
 9073   jcc(Assembler::negative, L_post_third_loop_done);
 9074 
 9075   movl(yz_idx, Address(y, idx, Address::times_4,  0));
 9076   movq(product, x_xstart);
 9077   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
 9078   movl(yz_idx, Address(z, idx, Address::times_4,  0));
 9079 
 9080   add2_with_carry(rdx, product, yz_idx, carry);
 9081 
 9082   movl(Address(z, idx, Address::times_4,  0), product);
 9083   shrq(product, 32);
 9084 
 9085   shlq(rdx, 32);
 9086   orq(product, rdx);
 9087   movq(carry, product);
 9088 
 9089   bind(L_post_third_loop_done);
 9090 }
 9091 
 9092 /**
 9093  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
 9094  *
 9095  */
 9096 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
 9097                                                   Register carry, Register carry2,
 9098                                                   Register idx, Register jdx,
 9099                                                   Register yz_idx1, Register yz_idx2,
 9100                                                   Register tmp, Register tmp3, Register tmp4) {
 9101   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
 9102 
 9103   //   jlong carry, x[], y[], z[];
 9104   //   int kdx = ystart+1;
 9105   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
 9106   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
 9107   //     jlong carry2  = (jlong)(tmp3 >>> 64);
 9108   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
 9109   //     carry  = (jlong)(tmp4 >>> 64);
 9110   //     z[kdx+idx+1] = (jlong)tmp3;
 9111   //     z[kdx+idx] = (jlong)tmp4;
 9112   //   }
 9113   //   idx += 2;
 9114   //   if (idx > 0) {
 9115   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
 9116   //     z[kdx+idx] = (jlong)yz_idx1;
 9117   //     carry  = (jlong)(yz_idx1 >>> 64);
 9118   //   }
 9119   //
 9120 
 9121   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
 9122 
 9123   movl(jdx, idx);
 9124   andl(jdx, 0xFFFFFFFC);
 9125   shrl(jdx, 2);
 9126 
 9127   bind(L_third_loop);
 9128   subl(jdx, 1);
 9129   jcc(Assembler::negative, L_third_loop_exit);
 9130   subl(idx, 4);
 9131 
 9132   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
 9133   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
 9134   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
 9135   rorxq(yz_idx2, yz_idx2, 32);
 9136 
 9137   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
 9138   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
 9139 
 9140   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
 9141   rorxq(yz_idx1, yz_idx1, 32);
 9142   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
 9143   rorxq(yz_idx2, yz_idx2, 32);
 9144 
 9145   if (VM_Version::supports_adx()) {
 9146     adcxq(tmp3, carry);
 9147     adoxq(tmp3, yz_idx1);
 9148 
 9149     adcxq(tmp4, tmp);
 9150     adoxq(tmp4, yz_idx2);
 9151 
 9152     movl(carry, 0); // does not affect flags
 9153     adcxq(carry2, carry);
 9154     adoxq(carry2, carry);
 9155   } else {
 9156     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
 9157     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
 9158   }
 9159   movq(carry, carry2);
 9160 
 9161   movl(Address(z, idx, Address::times_4, 12), tmp3);
 9162   shrq(tmp3, 32);
 9163   movl(Address(z, idx, Address::times_4,  8), tmp3);
 9164 
 9165   movl(Address(z, idx, Address::times_4,  4), tmp4);
 9166   shrq(tmp4, 32);
 9167   movl(Address(z, idx, Address::times_4,  0), tmp4);
 9168 
 9169   jmp(L_third_loop);
 9170 
 9171   bind (L_third_loop_exit);
 9172 
 9173   andl (idx, 0x3);
 9174   jcc(Assembler::zero, L_post_third_loop_done);
 9175 
 9176   Label L_check_1;
 9177   subl(idx, 2);
 9178   jcc(Assembler::negative, L_check_1);
 9179 
 9180   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
 9181   rorxq(yz_idx1, yz_idx1, 32);
 9182   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
 9183   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
 9184   rorxq(yz_idx2, yz_idx2, 32);
 9185 
 9186   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
 9187 
 9188   movl(Address(z, idx, Address::times_4,  4), tmp3);
 9189   shrq(tmp3, 32);
 9190   movl(Address(z, idx, Address::times_4,  0), tmp3);
 9191   movq(carry, tmp4);
 9192 
 9193   bind (L_check_1);
 9194   addl (idx, 0x2);
 9195   andl (idx, 0x1);
 9196   subl(idx, 1);
 9197   jcc(Assembler::negative, L_post_third_loop_done);
 9198   movl(tmp4, Address(y, idx, Address::times_4,  0));
 9199   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
 9200   movl(tmp4, Address(z, idx, Address::times_4,  0));
 9201 
 9202   add2_with_carry(carry2, tmp3, tmp4, carry);
 9203 
 9204   movl(Address(z, idx, Address::times_4,  0), tmp3);
 9205   shrq(tmp3, 32);
 9206 
 9207   shlq(carry2, 32);
 9208   orq(tmp3, carry2);
 9209   movq(carry, tmp3);
 9210 
 9211   bind(L_post_third_loop_done);
 9212 }
 9213 
 9214 /**
 9215  * Code for BigInteger::multiplyToLen() intrinsic.
 9216  *
 9217  * rdi: x
 9218  * rax: xlen
 9219  * rsi: y
 9220  * rcx: ylen
 9221  * r8:  z
 9222  * r11: zlen
 9223  * r12: tmp1
 9224  * r13: tmp2
 9225  * r14: tmp3
 9226  * r15: tmp4
 9227  * rbx: tmp5
 9228  *
 9229  */
 9230 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
 9231                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
 9232   ShortBranchVerifier sbv(this);
 9233   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
 9234 
 9235   push(tmp1);
 9236   push(tmp2);
 9237   push(tmp3);
 9238   push(tmp4);
 9239   push(tmp5);
 9240 
 9241   push(xlen);
 9242   push(zlen);
 9243 
 9244   const Register idx = tmp1;
 9245   const Register kdx = tmp2;
 9246   const Register xstart = tmp3;
 9247 
 9248   const Register y_idx = tmp4;
 9249   const Register carry = tmp5;
 9250   const Register product  = xlen;
 9251   const Register x_xstart = zlen;  // reuse register
 9252 
 9253   // First Loop.
 9254   //
 9255   //  final static long LONG_MASK = 0xffffffffL;
 9256   //  int xstart = xlen - 1;
 9257   //  int ystart = ylen - 1;
 9258   //  long carry = 0;
 9259   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
 9260   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
 9261   //    z[kdx] = (int)product;
 9262   //    carry = product >>> 32;
 9263   //  }
 9264   //  z[xstart] = (int)carry;
 9265   //
 9266 
 9267   movl(idx, ylen);      // idx = ylen;
 9268   movl(kdx, zlen);      // kdx = xlen+ylen;
 9269   xorq(carry, carry);   // carry = 0;
 9270 
 9271   Label L_done;
 9272 
 9273   movl(xstart, xlen);
 9274   decrementl(xstart);
 9275   jcc(Assembler::negative, L_done);
 9276 
 9277   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
 9278 
 9279   Label L_second_loop;
 9280   testl(kdx, kdx);
 9281   jcc(Assembler::zero, L_second_loop);
 9282 
 9283   Label L_carry;
 9284   subl(kdx, 1);
 9285   jcc(Assembler::zero, L_carry);
 9286 
 9287   movl(Address(z, kdx, Address::times_4,  0), carry);
 9288   shrq(carry, 32);
 9289   subl(kdx, 1);
 9290 
 9291   bind(L_carry);
 9292   movl(Address(z, kdx, Address::times_4,  0), carry);
 9293 
 9294   // Second and third (nested) loops.
 9295   //
 9296   // for (int i = xstart-1; i >= 0; i--) { // Second loop
 9297   //   carry = 0;
 9298   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
 9299   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
 9300   //                    (z[k] & LONG_MASK) + carry;
 9301   //     z[k] = (int)product;
 9302   //     carry = product >>> 32;
 9303   //   }
 9304   //   z[i] = (int)carry;
 9305   // }
 9306   //
 9307   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
 9308 
 9309   const Register jdx = tmp1;
 9310 
 9311   bind(L_second_loop);
 9312   xorl(carry, carry);    // carry = 0;
 9313   movl(jdx, ylen);       // j = ystart+1
 9314 
 9315   subl(xstart, 1);       // i = xstart-1;
 9316   jcc(Assembler::negative, L_done);
 9317 
 9318   push (z);
 9319 
 9320   Label L_last_x;
 9321   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
 9322   subl(xstart, 1);       // i = xstart-1;
 9323   jcc(Assembler::negative, L_last_x);
 9324 
 9325   if (UseBMI2Instructions) {
 9326     movq(rdx,  Address(x, xstart, Address::times_4,  0));
 9327     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
 9328   } else {
 9329     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
 9330     rorq(x_xstart, 32);  // convert big-endian to little-endian
 9331   }
 9332 
 9333   Label L_third_loop_prologue;
 9334   bind(L_third_loop_prologue);
 9335 
 9336   push (x);
 9337   push (xstart);
 9338   push (ylen);
 9339 
 9340 
 9341   if (UseBMI2Instructions) {
 9342     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
 9343   } else { // !UseBMI2Instructions
 9344     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
 9345   }
 9346 
 9347   pop(ylen);
 9348   pop(xlen);
 9349   pop(x);
 9350   pop(z);
 9351 
 9352   movl(tmp3, xlen);
 9353   addl(tmp3, 1);
 9354   movl(Address(z, tmp3, Address::times_4,  0), carry);
 9355   subl(tmp3, 1);
 9356   jccb(Assembler::negative, L_done);
 9357 
 9358   shrq(carry, 32);
 9359   movl(Address(z, tmp3, Address::times_4,  0), carry);
 9360   jmp(L_second_loop);
 9361 
 9362   // The following infrequent code is moved outside the loops.
 9363   bind(L_last_x);
 9364   if (UseBMI2Instructions) {
 9365     movl(rdx, Address(x,  0));
 9366   } else {
 9367     movl(x_xstart, Address(x,  0));
 9368   }
 9369   jmp(L_third_loop_prologue);
 9370 
 9371   bind(L_done);
 9372 
 9373   pop(zlen);
 9374   pop(xlen);
 9375 
 9376   pop(tmp5);
 9377   pop(tmp4);
 9378   pop(tmp3);
 9379   pop(tmp2);
 9380   pop(tmp1);
 9381 }
 9382 
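// Compute the index of the first mismatching element of obja[] and objb[],
// or -1 if the ranges are identical.  'length' arrives in elements and is
// scaled to bytes via the shift count in log2_array_indxscale (rcx); the byte
// position of the first difference is scaled back to an element index in
// 'result' before returning.  The comparison proceeds in progressively
// smaller chunks: 64-byte AVX-512 vectors (when supported), 32/16-byte
// vectors, then 8/4-byte words and single bytes.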
 9383 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
 9384   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
 9385   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
 9386   Label VECTOR64_LOOP, VECTOR64_TAIL, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
 9387   Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
 9388   Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
 9389   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
 9390   Label SAME_TILL_END, DONE;
 9391   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
 9392 
 9393   //scale is in rcx in both Win64 and Unix
 9394   ShortBranchVerifier sbv(this);
 9395 
 9396   shlq(length);
 9397   xorq(result, result);
 9398 
 9399   if ((UseAVX > 2) &&
 9400       VM_Version::supports_avx512vlbw()) {
 9401     set_vector_masking();  // opening of the stub context for programming mask registers
 9402     cmpq(length, 64);
 9403     jcc(Assembler::less, VECTOR32_TAIL);
 9404     movq(tmp1, length);
 9405     andq(tmp1, 0x3F);      // tail count
 9406     andq(length, ~(0x3F)); //vector count
 9407 
 9408     bind(VECTOR64_LOOP);
 9409     // AVX512 code to compare 64 byte vectors.
 9410     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
 9411     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
 9412     kortestql(k7, k7);
 9413     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
 9414     addq(result, 64);
 9415     subq(length, 64);
 9416     jccb(Assembler::notZero, VECTOR64_LOOP);
 9417 
 9418     //bind(VECTOR64_TAIL);
 9419     testq(tmp1, tmp1);
 9420     jcc(Assembler::zero, SAME_TILL_END);
 9421 
 9422     bind(VECTOR64_TAIL);
 9423     // AVX512 code to compare up to 63 remaining bytes.
 9424     // Save k1
 9425     kmovql(k3, k1);
 9426     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
 9427     shlxq(tmp2, tmp2, tmp1);
 9428     notq(tmp2);
 9429     kmovql(k1, tmp2);
 9430 
 9431     evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
 9432     evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
 9433 
 9434     ktestql(k7, k1);
 9435     // Restore k1
 9436     kmovql(k1, k3);
 9437     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
 9438 
 9439     bind(VECTOR64_NOT_EQUAL);
 9440     kmovql(tmp1, k7);
 9441     notq(tmp1);
 9442     tzcntq(tmp1, tmp1);
 9443     addq(result, tmp1);
 9444     shrq(result);
 9445     jmp(DONE);
 9446     bind(VECTOR32_TAIL);
 9447     clear_vector_masking();   // closing of the stub context for programming mask registers
 9448   }
 9449 
 9450   cmpq(length, 8);
 9451   jcc(Assembler::equal, VECTOR8_LOOP);
 9452   jcc(Assembler::less, VECTOR4_TAIL);
 9453 
 9454   if (UseAVX >= 2) {
 9455 
 9456     cmpq(length, 16);
 9457     jcc(Assembler::equal, VECTOR16_LOOP);
 9458     jcc(Assembler::less, VECTOR8_LOOP);
 9459 
 9460     cmpq(length, 32);
 9461     jccb(Assembler::less, VECTOR16_TAIL);
 9462 
 9463     subq(length, 32);
 9464     bind(VECTOR32_LOOP);
 9465     vmovdqu(rymm0, Address(obja, result));
 9466     vmovdqu(rymm1, Address(objb, result));
 9467     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
 9468     vptest(rymm2, rymm2);
 9469     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
 9470     addq(result, 32);
 9471     subq(length, 32);
 9472     jccb(Assembler::greaterEqual, VECTOR32_LOOP);
 9473     addq(length, 32);
 9474     jcc(Assembler::equal, SAME_TILL_END);
 9475     //falling through if less than 32 bytes left //close the branch here.
 9476 
 9477     bind(VECTOR16_TAIL);
 9478     cmpq(length, 16);
 9479     jccb(Assembler::less, VECTOR8_TAIL);
 9480     bind(VECTOR16_LOOP);
 9481     movdqu(rymm0, Address(obja, result));
 9482     movdqu(rymm1, Address(objb, result));
 9483     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
 9484     ptest(rymm2, rymm2);
 9485     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
 9486     addq(result, 16);
 9487     subq(length, 16);
 9488     jcc(Assembler::equal, SAME_TILL_END);
 9489     //falling through if less than 16 bytes left
 9490   } else {//regular intrinsics
 9491 
 9492     cmpq(length, 16);
 9493     jccb(Assembler::less, VECTOR8_TAIL);
 9494 
 9495     subq(length, 16);
 9496     bind(VECTOR16_LOOP);
 9497     movdqu(rymm0, Address(obja, result));
 9498     movdqu(rymm1, Address(objb, result));
 9499     pxor(rymm0, rymm1);
 9500     ptest(rymm0, rymm0);
 9501     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
 9502     addq(result, 16);
 9503     subq(length, 16);
 9504     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
 9505     addq(length, 16);
 9506     jcc(Assembler::equal, SAME_TILL_END);
 9507     //falling through if less than 16 bytes left
 9508   }
 9509 
 9510   bind(VECTOR8_TAIL);
 9511   cmpq(length, 8);
 9512   jccb(Assembler::less, VECTOR4_TAIL);
 9513   bind(VECTOR8_LOOP);
 9514   movq(tmp1, Address(obja, result));
 9515   movq(tmp2, Address(objb, result));
 9516   xorq(tmp1, tmp2);
 9517   testq(tmp1, tmp1);
 9518   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
 9519   addq(result, 8);
 9520   subq(length, 8);
 9521   jcc(Assembler::equal, SAME_TILL_END);
 9522   //falling through if less than 8 bytes left
 9523 
 9524   bind(VECTOR4_TAIL);
 9525   cmpq(length, 4);
 9526   jccb(Assembler::less, BYTES_TAIL);
 9527   bind(VECTOR4_LOOP);
 9528   movl(tmp1, Address(obja, result));
 9529   xorl(tmp1, Address(objb, result));
 9530   testl(tmp1, tmp1);
 9531   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
 9532   addq(result, 4);
 9533   subq(length, 4);
 9534   jcc(Assembler::equal, SAME_TILL_END);
 9535   //falling through if less than 4 bytes left
 9536 
 9537   bind(BYTES_TAIL);
 9538   bind(BYTES_LOOP);
 9539   load_unsigned_byte(tmp1, Address(obja, result));
 9540   load_unsigned_byte(tmp2, Address(objb, result));
 9541   xorl(tmp1, tmp2);
 9542   testl(tmp1, tmp1);
 9543   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
 9544   decq(length);
 9545   jccb(Assembler::zero, SAME_TILL_END);
 9546   incq(result);
 9547   load_unsigned_byte(tmp1, Address(obja, result));
 9548   load_unsigned_byte(tmp2, Address(objb, result));
 9549   xorl(tmp1, tmp2);
 9550   testl(tmp1, tmp1);
 9551   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
 9552   decq(length);
 9553   jccb(Assembler::zero, SAME_TILL_END);
 9554   incq(result);
 9555   load_unsigned_byte(tmp1, Address(obja, result));
 9556   load_unsigned_byte(tmp2, Address(objb, result));
 9557   xorl(tmp1, tmp2);
 9558   testl(tmp1, tmp1);
 9559   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
 9560   jmpb(SAME_TILL_END);
 9561 
 9562   if (UseAVX >= 2) {
 9563     bind(VECTOR32_NOT_EQUAL);
 9564     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
 9565     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
 9566     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
 9567     vpmovmskb(tmp1, rymm0);
 9568     bsfq(tmp1, tmp1);
 9569     addq(result, tmp1);
 9570     shrq(result);
 9571     jmpb(DONE);
 9572   }
 9573 
 9574   bind(VECTOR16_NOT_EQUAL);
 9575   if (UseAVX >= 2) {
 9576     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
 9577     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
 9578     pxor(rymm0, rymm2);
 9579   } else {
 9580     pcmpeqb(rymm2, rymm2);
 9581     pxor(rymm0, rymm1);
 9582     pcmpeqb(rymm0, rymm1);
 9583     pxor(rymm0, rymm2);
 9584   }
 9585   pmovmskb(tmp1, rymm0);
 9586   bsfq(tmp1, tmp1);
 9587   addq(result, tmp1);
 9588   shrq(result);
 9589   jmpb(DONE);
 9590 
 9591   bind(VECTOR8_NOT_EQUAL);
 9592   bind(VECTOR4_NOT_EQUAL);
 9593   bsfq(tmp1, tmp1);
 9594   shrq(tmp1, 3);
 9595   addq(result, tmp1);
 9596   bind(BYTES_NOT_EQUAL);
 9597   shrq(result);
 9598   jmpb(DONE);
 9599 
 9600   bind(SAME_TILL_END);
 9601   mov64(result, -1);
 9602 
 9603   bind(DONE);
 9604 }
 9605 
 9606 //Helper functions for square_to_len()
 9607 
 9608 /**
 9609  * Store the squares of x[], right shifted one bit (i.e., divided by 2), into z[].
 9610  * Preserves x and z and modifies the rest of the registers.
 9611  */
 9612 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
 9613   // Perform square and right shift by 1
 9614   // Handle odd xlen case first, then for even xlen do the following
 9615   // jlong carry = 0;
 9616   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
 9617   //     huge_128 product = x[j:j+1] * x[j:j+1];
 9618   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
 9619   //     z[i+2:i+3] = (jlong)(product >>> 1);
 9620   //     carry = (jlong)product;
 9621   // }
 9622 
 9623   xorq(tmp5, tmp5);     // carry
 9624   xorq(rdxReg, rdxReg);
 9625   xorl(tmp1, tmp1);     // index for x
 9626   xorl(tmp4, tmp4);     // index for z
 9627 
 9628   Label L_first_loop, L_first_loop_exit;
 9629 
 9630   testl(xlen, 1);
 9631   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
 9632 
 9633   // Square and right shift by 1 the odd element using 32 bit multiply
 9634   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
 9635   imulq(raxReg, raxReg);
 9636   shrq(raxReg, 1);
 9637   adcq(tmp5, 0);
 9638   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
 9639   incrementl(tmp1);
 9640   addl(tmp4, 2);
 9641 
 9642   // Square and  right shift by 1 the rest using 64 bit multiply
 9643   bind(L_first_loop);
 9644   cmpptr(tmp1, xlen);
 9645   jccb(Assembler::equal, L_first_loop_exit);
 9646 
 9647   // Square
 9648   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
 9649   rorq(raxReg, 32);    // convert big-endian to little-endian
 9650   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
 9651 
 9652   // Right shift by 1 and save carry
 9653   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
 9654   rcrq(rdxReg, 1);
 9655   rcrq(raxReg, 1);
 9656   adcq(tmp5, 0);
 9657 
 9658   // Store result in z
 9659   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
 9660   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
 9661 
 9662   // Update indices for x and z
 9663   addl(tmp1, 2);
 9664   addl(tmp4, 4);
 9665   jmp(L_first_loop);
 9666 
 9667   bind(L_first_loop_exit);
 9668 }
 9669 
 9670 
 9671 /**
 9672  * Perform the following multiply-add operation using BMI2 instructions:
 9673  * carry:sum = sum + op1*op2 + carry
 9674  * op2 should be in rdx.
 9675  * op2 is preserved; all other registers are modified.
 9676  */
 9677 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
 9678   // assert op2 is rdx
 9679   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
 9680   addq(sum, carry);
 9681   adcq(tmp2, 0);
 9682   addq(sum, op1);
 9683   adcq(tmp2, 0);
 9684   movq(carry, tmp2);
 9685 }
 9686 
 9687 /**
 9688  * Perform the following multiply-add operation:
 9689  * carry:sum = sum + op1*op2 + carry
 9690  * Preserves op1 and op2, and modifies the rest of the registers.
 9691  */
 9692 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
 9693   // rdx:rax = op1 * op2
 9694   movq(raxReg, op2);
 9695   mulq(op1);
 9696 
 9697   //  rdx:rax = sum + carry + rdx:rax
 9698   addq(sum, carry);
 9699   adcq(rdxReg, 0);
 9700   addq(sum, raxReg);
 9701   adcq(rdxReg, 0);
 9702 
 9703   // carry:sum = rdx:sum
 9704   movq(carry, rdxReg);
 9705 }
 9706 
 9707 /**
 9708  * Add a 64 bit long carry into z[] with carry propagation.
 9709  * Preserves the z and carry register values and modifies the rest of the registers.
 9710  *
 9711  */
 9712 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
 9713   Label L_fourth_loop, L_fourth_loop_exit;
 9714 
 9715   movl(tmp1, 1);
 9716   subl(zlen, 2);
 9717   addq(Address(z, zlen, Address::times_4, 0), carry);
 9718 
 9719   bind(L_fourth_loop);
 9720   jccb(Assembler::carryClear, L_fourth_loop_exit);
 9721   subl(zlen, 2);
 9722   jccb(Assembler::negative, L_fourth_loop_exit);
 9723   addq(Address(z, zlen, Address::times_4, 0), tmp1);
 9724   jmp(L_fourth_loop);
 9725   bind(L_fourth_loop_exit);
 9726 }
 9727 
 9728 /**
 9729  * Shift z[] left by 1 bit.
 9730  * Preserves the x, len, z and zlen registers and modifies the rest of the registers.
 9731  *
 9732  */
 9733 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
 9734 
 9735   Label L_fifth_loop, L_fifth_loop_exit;
 9736 
 9737   // Fifth loop
 9738   // Perform primitiveLeftShift(z, zlen, 1)
 9739 
 9740   const Register prev_carry = tmp1;
 9741   const Register new_carry = tmp4;
 9742   const Register value = tmp2;
 9743   const Register zidx = tmp3;
 9744 
 9745   // int zidx, carry;
 9746   // long value;
 9747   // carry = 0;
 9748   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
 9749   //    (carry:value)  = (z[i] << 1) | carry ;
 9750   //    z[i] = value;
 9751   // }
 9752 
 9753   movl(zidx, zlen);
 9754   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
 9755 
 9756   bind(L_fifth_loop);
 9757   decl(zidx);  // Use decl to preserve carry flag
 9758   decl(zidx);
 9759   jccb(Assembler::negative, L_fifth_loop_exit);
 9760 
 9761   if (UseBMI2Instructions) {
 9762      movq(value, Address(z, zidx, Address::times_4, 0));
 9763      rclq(value, 1);
 9764      rorxq(value, value, 32);
 9765      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
 9766   }
 9767   else {
 9768     // clear new_carry
 9769     xorl(new_carry, new_carry);
 9770 
 9771     // Shift z[i] by 1, or in previous carry and save new carry
 9772     movq(value, Address(z, zidx, Address::times_4, 0));
 9773     shlq(value, 1);
 9774     adcl(new_carry, 0);
 9775 
 9776     orq(value, prev_carry);
 9777     rorq(value, 0x20);
 9778     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
 9779 
 9780     // Set previous carry = new carry
 9781     movl(prev_carry, new_carry);
 9782   }
 9783   jmp(L_fifth_loop);
 9784 
 9785   bind(L_fifth_loop_exit);
 9786 }
 9787 
 9788 
 9789 /**
 9790  * Code for BigInteger::squareToLen() intrinsic
 9791  *
 9792  * rdi: x
 9793  * rsi: len
 9794  * r8:  z
 9795  * rcx: zlen
 9796  * r12: tmp1
 9797  * r13: tmp2
 9798  * r14: tmp3
 9799  * r15: tmp4
 9800  * rbx: tmp5
 9801  *
 9802  */
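// Overall scheme (mirroring BigInteger.squareToLen): the first loop stores the
// squares of the 64-bit digits right-shifted by one bit, the nested loops add
// the off-diagonal products on top, the fifth loop shifts the whole result
// left by one bit (which doubles the cross products and restores the squares),
// and finally the low bit of the last square is OR-ed back in.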
 9803 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
 9804 
 9805   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
 9806   push(tmp1);
 9807   push(tmp2);
 9808   push(tmp3);
 9809   push(tmp4);
 9810   push(tmp5);
 9811 
 9812   // First loop
 9813   // Store the squares, right shifted one bit (i.e., divided by 2).
 9814   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
 9815 
 9816   // Add in off-diagonal sums.
 9817   //
 9818   // Second, third (nested) and fourth loops.
 9819   // zlen +=2;
 9820   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
 9821   //    carry = 0;
 9822   //    long op2 = x[xidx:xidx+1];
 9823   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
 9824   //       k -= 2;
 9825   //       long op1 = x[j:j+1];
 9826   //       long sum = z[k:k+1];
 9827   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
 9828   //       z[k:k+1] = sum;
 9829   //    }
 9830   //    add_one_64(z, k, carry, tmp_regs);
 9831   // }
 9832 
 9833   const Register carry = tmp5;
 9834   const Register sum = tmp3;
 9835   const Register op1 = tmp4;
 9836   Register op2 = tmp2;
 9837 
 9838   push(zlen);
 9839   push(len);
 9840   addl(zlen,2);
 9841   bind(L_second_loop);
 9842   xorq(carry, carry);
 9843   subl(zlen, 4);
 9844   subl(len, 2);
 9845   push(zlen);
 9846   push(len);
 9847   cmpl(len, 0);
 9848   jccb(Assembler::lessEqual, L_second_loop_exit);
 9849 
 9850   // Multiply an array by one 64 bit long.
 9851   if (UseBMI2Instructions) {
 9852     op2 = rdxReg;
 9853     movq(op2, Address(x, len, Address::times_4,  0));
 9854     rorxq(op2, op2, 32);
 9855   }
 9856   else {
 9857     movq(op2, Address(x, len, Address::times_4,  0));
 9858     rorq(op2, 32);
 9859   }
 9860 
 9861   bind(L_third_loop);
 9862   decrementl(len);
 9863   jccb(Assembler::negative, L_third_loop_exit);
 9864   decrementl(len);
 9865   jccb(Assembler::negative, L_last_x);
 9866 
 9867   movq(op1, Address(x, len, Address::times_4,  0));
 9868   rorq(op1, 32);
 9869 
 9870   bind(L_multiply);
 9871   subl(zlen, 2);
 9872   movq(sum, Address(z, zlen, Address::times_4,  0));
 9873 
 9874   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
 9875   if (UseBMI2Instructions) {
 9876     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
 9877   }
 9878   else {
 9879     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
 9880   }
 9881 
 9882   movq(Address(z, zlen, Address::times_4, 0), sum);
 9883 
 9884   jmp(L_third_loop);
 9885   bind(L_third_loop_exit);
 9886 
 9887   // Fourth loop
 9888   // Add 64 bit long carry into z with carry propagation.
 9889   // Uses the adjusted (offset) zlen.
 9890   add_one_64(z, zlen, carry, tmp1);
 9891 
 9892   pop(len);
 9893   pop(zlen);
 9894   jmp(L_second_loop);
 9895 
 9896   // The following infrequent code is moved outside the loops.
 9897   bind(L_last_x);
 9898   movl(op1, Address(x, 0));
 9899   jmp(L_multiply);
 9900 
 9901   bind(L_second_loop_exit);
 9902   pop(len);
 9903   pop(zlen);
 9904   pop(len);
 9905   pop(zlen);
 9906 
 9907   // Fifth loop
 9908   // Shift z left 1 bit.
 9909   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
 9910 
 9911   // z[zlen-1] |= x[len-1] & 1;
 9912   movl(tmp3, Address(x, len, Address::times_4, -4));
 9913   andl(tmp3, 1);
 9914   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
 9915 
 9916   pop(tmp5);
 9917   pop(tmp4);
 9918   pop(tmp3);
 9919   pop(tmp2);
 9920   pop(tmp1);
 9921 }
 9922 
 9923 /**
 9924  * Helper function for mul_add()
 9925  * Multiply in[] by the int k and add the result to out[] starting at offset offs,
 9926  * using a 128 bit by 32 bit multiply; the carry is returned in tmp5.
 9927  * Only the quad-int-aligned portion of in[] is processed by this function.
 9928  * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
 9929  * This function preserves the out, in and k registers.
 9930  * len and offset point to the appropriate indices in "in" and "out" respectively.
 9931  * tmp5 holds the carry.
 9932  * The other registers are temporary and are modified.
 9933  *
 9934  */
 9935 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
 9936   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
 9937   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
 9938 
 9939   Label L_first_loop, L_first_loop_exit;
 9940 
 9941   movl(tmp1, len);
 9942   shrl(tmp1, 2);
 9943 
 9944   bind(L_first_loop);
 9945   subl(tmp1, 1);
 9946   jccb(Assembler::negative, L_first_loop_exit);
 9947 
 9948   subl(len, 4);
 9949   subl(offset, 4);
 9950 
 9951   Register op2 = tmp2;
 9952   const Register sum = tmp3;
 9953   const Register op1 = tmp4;
 9954   const Register carry = tmp5;
 9955 
 9956   if (UseBMI2Instructions) {
 9957     op2 = rdxReg;
 9958   }
 9959 
 9960   movq(op1, Address(in, len, Address::times_4,  8));
 9961   rorq(op1, 32);
 9962   movq(sum, Address(out, offset, Address::times_4,  8));
 9963   rorq(sum, 32);
 9964   if (UseBMI2Instructions) {
 9965     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
 9966   }
 9967   else {
 9968     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
 9969   }
 9970   // Store back in big endian from little endian
 9971   rorq(sum, 0x20);
 9972   movq(Address(out, offset, Address::times_4,  8), sum);
 9973 
 9974   movq(op1, Address(in, len, Address::times_4,  0));
 9975   rorq(op1, 32);
 9976   movq(sum, Address(out, offset, Address::times_4,  0));
 9977   rorq(sum, 32);
 9978   if (UseBMI2Instructions) {
 9979     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
 9980   }
 9981   else {
 9982     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
 9983   }
 9984   // Store back in big endian from little endian
 9985   rorq(sum, 0x20);
 9986   movq(Address(out, offset, Address::times_4,  0), sum);
 9987 
 9988   jmp(L_first_loop);
 9989   bind(L_first_loop_exit);
 9990 }
 9991 
 9992 /**
 9993  * Code for BigInteger::mulAdd() intrinsic
 9994  *
 9995  * rdi: out
 9996  * rsi: in
 9997  * r11: offs (out.length - offset)
 9998  * rcx: len
 9999  * r8:  k
10000  * r12: tmp1
10001  * r13: tmp2
10002  * r14: tmp3
10003  * r15: tmp4
10004  * rbx: tmp5
10005  * Multiply the in[] by word k and add to out[], return the carry in rax
10006  */
10007 void MacroAssembler::mul_add(Register out, Register in, Register offs,
10008    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
10009    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
10010 
10011   Label L_carry, L_last_in, L_done;
10012 
10013 // carry = 0;
10014 // for (int j=len-1; j >= 0; j--) {
10015 //    long product = (in[j] & LONG_MASK) * kLong +
10016 //                   (out[offs] & LONG_MASK) + carry;
10017 //    out[offs--] = (int)product;
10018 //    carry = product >>> 32;
10019 // }
10020 //
10021   push(tmp1);
10022   push(tmp2);
10023   push(tmp3);
10024   push(tmp4);
10025   push(tmp5);
10026 
10027   Register op2 = tmp2;
10028   const Register sum = tmp3;
10029   const Register op1 = tmp4;
10030   const Register carry =  tmp5;
10031 
  if (UseBMI2Instructions) {
    op2 = rdxReg;
  }
  movl(op2, k);
10039 
10040   xorq(carry, carry);
10041 
10042   //First loop
10043 
10044   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
10045   //The carry is in tmp5
10046   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
10047 
10048   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
10049   decrementl(len);
10050   jccb(Assembler::negative, L_carry);
10051   decrementl(len);
10052   jccb(Assembler::negative, L_last_in);
10053 
10054   movq(op1, Address(in, len, Address::times_4,  0));
10055   rorq(op1, 32);
10056 
10057   subl(offs, 2);
10058   movq(sum, Address(out, offs, Address::times_4,  0));
10059   rorq(sum, 32);
10060 
10061   if (UseBMI2Instructions) {
10062     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
10063   }
10064   else {
10065     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
10066   }
10067 
10068   // Store back in big endian from little endian
10069   rorq(sum, 0x20);
10070   movq(Address(out, offs, Address::times_4,  0), sum);
10071 
10072   testl(len, len);
10073   jccb(Assembler::zero, L_carry);
10074 
10075   //Multiply the last in[] entry, if any
10076   bind(L_last_in);
10077   movl(op1, Address(in, 0));
10078   movl(sum, Address(out, offs, Address::times_4,  -4));
10079 
10080   movl(raxReg, k);
10081   mull(op1); //tmp4 * eax -> edx:eax
10082   addl(sum, carry);
10083   adcl(rdxReg, 0);
10084   addl(sum, raxReg);
10085   adcl(rdxReg, 0);
10086   movl(carry, rdxReg);
10087 
10088   movl(Address(out, offs, Address::times_4,  -4), sum);
10089 
10090   bind(L_carry);
10091   //return tmp5/carry as carry in rax
10092   movl(rax, carry);
10093 
10094   bind(L_done);
10095   pop(tmp5);
10096   pop(tmp4);
10097   pop(tmp3);
10098   pop(tmp2);
10099   pop(tmp1);
10100 }
10101 #endif
10102 
10103 /**
10104  * Emits code to update CRC-32 with a byte value according to constants in table
10105  *
 * @param [in,out] crc   Register containing the crc.
 * @param [in]     val   Register containing the byte to fold into the CRC.
 * @param [in]     table Register containing the table of crc constants.
10109  *
10110  * uint32_t crc;
10111  * val = crc_table[(val ^ crc) & 0xFF];
10112  * crc = val ^ (crc >> 8);
10113  *
10114  */
10115 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
10116   xorl(val, crc);
10117   andl(val, 0xFF);
10118   shrl(crc, 8); // unsigned shift
10119   xorl(crc, Address(table, val, Address::times_4, 0));
10120 }
10121 
10122 /**
10123  * Fold 128-bit data chunk
10124  */
10125 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
10126   if (UseAVX > 0) {
10127     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
10128     vpclmulldq(xcrc, xK, xcrc); // [63:0]
10129     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
10130     pxor(xcrc, xtmp);
10131   } else {
10132     movdqa(xtmp, xcrc);
10133     pclmulhdq(xtmp, xK);   // [123:64]
10134     pclmulldq(xcrc, xK);   // [63:0]
10135     pxor(xcrc, xtmp);
10136     movdqu(xtmp, Address(buf, offset));
10137     pxor(xcrc, xtmp);
10138   }
10139 }
10140 
10141 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
10142   if (UseAVX > 0) {
10143     vpclmulhdq(xtmp, xK, xcrc);
10144     vpclmulldq(xcrc, xK, xcrc);
10145     pxor(xcrc, xbuf);
10146     pxor(xcrc, xtmp);
10147   } else {
10148     movdqa(xtmp, xcrc);
10149     pclmulhdq(xtmp, xK);
10150     pclmulldq(xcrc, xK);
10151     pxor(xcrc, xbuf);
10152     pxor(xcrc, xtmp);
10153   }
10154 }
10155 
10156 /**
10157  * 8-bit folds to compute 32-bit CRC
10158  *
10159  * uint64_t xcrc;
10160  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
10161  */
10162 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
10163   movdl(tmp, xcrc);
10164   andl(tmp, 0xFF);
10165   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
10166   psrldq(xcrc, 1); // unsigned shift one byte
10167   pxor(xcrc, xtmp);
10168 }
10169 
10170 /**
10171  * uint32_t crc;
10172  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
10173  */
10174 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
10175   movl(tmp, crc);
10176   andl(tmp, 0xFF);
10177   shrl(crc, 8);
10178   xorl(crc, Address(table, tmp, Address::times_4, 0));
10179 }
10180 
10181 /**
10182  * @param crc   register containing existing CRC (32-bit)
10183  * @param buf   register pointing to input byte buffer (byte*)
10184  * @param len   register containing number of bytes
10185  * @param table register that will contain address of CRC table
10186  * @param tmp   scratch register
10187  */
10188 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
10189   assert_different_registers(crc, buf, len, table, tmp, rax);
10190 
10191   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
10192   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
10193 
  // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
  // context for the registers used, since all instructions below use 128-bit mode.
  // On EVEX without VL and BW, these instructions will all be AVX.
10197   if (VM_Version::supports_avx512vlbw()) {
10198     movl(tmp, 0xffff);
10199     kmovwl(k1, tmp);
10200   }
10201 
10202   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
10203   notl(crc); // ~crc
10204   cmpl(len, 16);
10205   jcc(Assembler::less, L_tail);
10206 
10207   // Align buffer to 16 bytes
10208   movl(tmp, buf);
10209   andl(tmp, 0xF);
10210   jccb(Assembler::zero, L_aligned);
10211   subl(tmp,  16);
10212   addl(len, tmp);
10213 
10214   align(4);
10215   BIND(L_align_loop);
10216   movsbl(rax, Address(buf, 0)); // load byte with sign extension
10217   update_byte_crc32(crc, rax, table);
10218   increment(buf);
10219   incrementl(tmp);
10220   jccb(Assembler::less, L_align_loop);
10221 
10222   BIND(L_aligned);
10223   movl(tmp, len); // save
10224   shrl(len, 4);
10225   jcc(Assembler::zero, L_tail_restore);
10226 
10227   // Fold crc into first bytes of vector
10228   movdqa(xmm1, Address(buf, 0));
10229   movdl(rax, xmm1);
10230   xorl(crc, rax);
10231   if (VM_Version::supports_sse4_1()) {
10232     pinsrd(xmm1, crc, 0);
10233   } else {
10234     pinsrw(xmm1, crc, 0);
10235     shrl(crc, 16);
10236     pinsrw(xmm1, crc, 1);
10237   }
10238   addptr(buf, 16);
10239   subl(len, 4); // len > 0
10240   jcc(Assembler::less, L_fold_tail);
10241 
10242   movdqa(xmm2, Address(buf,  0));
10243   movdqa(xmm3, Address(buf, 16));
10244   movdqa(xmm4, Address(buf, 32));
10245   addptr(buf, 48);
10246   subl(len, 3);
10247   jcc(Assembler::lessEqual, L_fold_512b);
10248 
10249   // Fold total 512 bits of polynomial on each iteration,
10250   // 128 bits per each of 4 parallel streams.
10251   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
10252 
10253   align(32);
10254   BIND(L_fold_512b_loop);
10255   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10256   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
10257   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
10258   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
10259   addptr(buf, 64);
10260   subl(len, 4);
10261   jcc(Assembler::greater, L_fold_512b_loop);
10262 
10263   // Fold 512 bits to 128 bits.
10264   BIND(L_fold_512b);
10265   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10266   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
10267   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
10268   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
10269 
10270   // Fold the rest of 128 bits data chunks
10271   BIND(L_fold_tail);
10272   addl(len, 3);
10273   jccb(Assembler::lessEqual, L_fold_128b);
10274   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10275 
10276   BIND(L_fold_tail_loop);
10277   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10278   addptr(buf, 16);
10279   decrementl(len);
10280   jccb(Assembler::greater, L_fold_tail_loop);
10281 
10282   // Fold 128 bits in xmm1 down into 32 bits in crc register.
10283   BIND(L_fold_128b);
10284   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
10285   if (UseAVX > 0) {
10286     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
10287     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
10288     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
10289   } else {
10290     movdqa(xmm2, xmm0);
10291     pclmulqdq(xmm2, xmm1, 0x1);
10292     movdqa(xmm3, xmm0);
10293     pand(xmm3, xmm2);
10294     pclmulqdq(xmm0, xmm3, 0x1);
10295   }
10296   psrldq(xmm1, 8);
10297   psrldq(xmm2, 4);
10298   pxor(xmm0, xmm1);
10299   pxor(xmm0, xmm2);
10300 
10301   // 8 8-bit folds to compute 32-bit CRC.
10302   for (int j = 0; j < 4; j++) {
10303     fold_8bit_crc32(xmm0, table, xmm1, rax);
10304   }
10305   movdl(crc, xmm0); // mov 32 bits to general register
10306   for (int j = 0; j < 4; j++) {
10307     fold_8bit_crc32(crc, table, rax);
10308   }
10309 
10310   BIND(L_tail_restore);
10311   movl(len, tmp); // restore
10312   BIND(L_tail);
10313   andl(len, 0xf);
10314   jccb(Assembler::zero, L_exit);
10315 
10316   // Fold the rest of bytes
10317   align(4);
10318   BIND(L_tail_loop);
10319   movsbl(rax, Address(buf, 0)); // load byte with sign extension
10320   update_byte_crc32(crc, rax, table);
10321   increment(buf);
10322   decrementl(len);
10323   jccb(Assembler::greater, L_tail_loop);
10324 
10325   BIND(L_exit);
  notl(crc); // ~crc
10327 }
10328 
10329 #ifdef _LP64
10330 // S. Gueron / Information Processing Letters 112 (2012) 184
10331 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
10332 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
10333 // Output: the 64-bit carry-less product of B * CONST
10334 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
10335                                      Register tmp1, Register tmp2, Register tmp3) {
10336   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10337   if (n > 0) {
10338     addq(tmp3, n * 256 * 8);
10339   }
10340   //    Q1 = TABLEExt[n][B & 0xFF];
10341   movl(tmp1, in);
10342   andl(tmp1, 0x000000FF);
10343   shll(tmp1, 3);
10344   addq(tmp1, tmp3);
10345   movq(tmp1, Address(tmp1, 0));
10346 
10347   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10348   movl(tmp2, in);
10349   shrl(tmp2, 8);
10350   andl(tmp2, 0x000000FF);
10351   shll(tmp2, 3);
10352   addq(tmp2, tmp3);
10353   movq(tmp2, Address(tmp2, 0));
10354 
10355   shlq(tmp2, 8);
10356   xorq(tmp1, tmp2);
10357 
10358   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10359   movl(tmp2, in);
10360   shrl(tmp2, 16);
10361   andl(tmp2, 0x000000FF);
10362   shll(tmp2, 3);
10363   addq(tmp2, tmp3);
10364   movq(tmp2, Address(tmp2, 0));
10365 
10366   shlq(tmp2, 16);
10367   xorq(tmp1, tmp2);
10368 
10369   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10370   shrl(in, 24);
10371   andl(in, 0x000000FF);
10372   shll(in, 3);
10373   addq(in, tmp3);
10374   movq(in, Address(in, 0));
10375 
10376   shlq(in, 24);
10377   xorq(in, tmp1);
10378   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10379 }
10380 
10381 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10382                                       Register in_out,
10383                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10384                                       XMMRegister w_xtmp2,
10385                                       Register tmp1,
10386                                       Register n_tmp2, Register n_tmp3) {
10387   if (is_pclmulqdq_supported) {
10388     movdl(w_xtmp1, in_out); // modified blindly
10389 
10390     movl(tmp1, const_or_pre_comp_const_index);
10391     movdl(w_xtmp2, tmp1);
10392     pclmulqdq(w_xtmp1, w_xtmp2, 0);
10393 
10394     movdq(in_out, w_xtmp1);
10395   } else {
10396     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
10397   }
10398 }
10399 
10400 // Recombination Alternative 2: No bit-reflections
10401 // T1 = (CRC_A * U1) << 1
10402 // T2 = (CRC_B * U2) << 1
10403 // C1 = T1 >> 32
10404 // C2 = T2 >> 32
10405 // T1 = T1 & 0xFFFFFFFF
10406 // T2 = T2 & 0xFFFFFFFF
10407 // T1 = CRC32(0, T1)
10408 // T2 = CRC32(0, T2)
10409 // C1 = C1 ^ T1
10410 // C2 = C2 ^ T2
10411 // CRC = C1 ^ C2 ^ CRC_C
10412 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10413                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10414                                      Register tmp1, Register tmp2,
10415                                      Register n_tmp3) {
10416   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10417   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10418   shlq(in_out, 1);
10419   movl(tmp1, in_out);
10420   shrq(in_out, 32);
10421   xorl(tmp2, tmp2);
10422   crc32(tmp2, tmp1, 4);
10423   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
10424   shlq(in1, 1);
10425   movl(tmp1, in1);
10426   shrq(in1, 32);
10427   xorl(tmp2, tmp2);
10428   crc32(tmp2, tmp1, 4);
10429   xorl(in1, tmp2);
10430   xorl(in_out, in1);
10431   xorl(in_out, in2);
10432 }
10433 
// Set N to a predefined value
// Subtract it from the length of the buffer
// Execute in a loop:
10437 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
10438 // for i = 1 to N do
10439 //  CRC_A = CRC32(CRC_A, A[i])
10440 //  CRC_B = CRC32(CRC_B, B[i])
10441 //  CRC_C = CRC32(CRC_C, C[i])
10442 // end for
10443 // Recombine
10444 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10445                                        Register in_out1, Register in_out2, Register in_out3,
10446                                        Register tmp1, Register tmp2, Register tmp3,
10447                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10448                                        Register tmp4, Register tmp5,
10449                                        Register n_tmp6) {
10450   Label L_processPartitions;
10451   Label L_processPartition;
10452   Label L_exit;
10453 
10454   bind(L_processPartitions);
10455   cmpl(in_out1, 3 * size);
10456   jcc(Assembler::less, L_exit);
10457     xorl(tmp1, tmp1);
10458     xorl(tmp2, tmp2);
10459     movq(tmp3, in_out2);
10460     addq(tmp3, size);
10461 
10462     bind(L_processPartition);
10463       crc32(in_out3, Address(in_out2, 0), 8);
10464       crc32(tmp1, Address(in_out2, size), 8);
10465       crc32(tmp2, Address(in_out2, size * 2), 8);
10466       addq(in_out2, 8);
10467       cmpq(in_out2, tmp3);
10468       jcc(Assembler::less, L_processPartition);
10469     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10470             w_xtmp1, w_xtmp2, w_xtmp3,
10471             tmp4, tmp5,
10472             n_tmp6);
10473     addq(in_out2, 2 * size);
10474     subl(in_out1, 3 * size);
10475     jmp(L_processPartitions);
10476 
10477   bind(L_exit);
10478 }
10479 #else
10480 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
10481                                      Register tmp1, Register tmp2, Register tmp3,
10482                                      XMMRegister xtmp1, XMMRegister xtmp2) {
10483   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10484   if (n > 0) {
10485     addl(tmp3, n * 256 * 8);
10486   }
10487   //    Q1 = TABLEExt[n][B & 0xFF];
10488   movl(tmp1, in_out);
10489   andl(tmp1, 0x000000FF);
10490   shll(tmp1, 3);
10491   addl(tmp1, tmp3);
10492   movq(xtmp1, Address(tmp1, 0));
10493 
10494   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10495   movl(tmp2, in_out);
10496   shrl(tmp2, 8);
10497   andl(tmp2, 0x000000FF);
10498   shll(tmp2, 3);
10499   addl(tmp2, tmp3);
10500   movq(xtmp2, Address(tmp2, 0));
10501 
10502   psllq(xtmp2, 8);
10503   pxor(xtmp1, xtmp2);
10504 
10505   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10506   movl(tmp2, in_out);
10507   shrl(tmp2, 16);
10508   andl(tmp2, 0x000000FF);
10509   shll(tmp2, 3);
10510   addl(tmp2, tmp3);
10511   movq(xtmp2, Address(tmp2, 0));
10512 
10513   psllq(xtmp2, 16);
10514   pxor(xtmp1, xtmp2);
10515 
10516   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10517   shrl(in_out, 24);
10518   andl(in_out, 0x000000FF);
10519   shll(in_out, 3);
10520   addl(in_out, tmp3);
10521   movq(xtmp2, Address(in_out, 0));
10522 
10523   psllq(xtmp2, 24);
10524   pxor(xtmp1, xtmp2); // Result in CXMM
10525   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10526 }
10527 
10528 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10529                                       Register in_out,
10530                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10531                                       XMMRegister w_xtmp2,
10532                                       Register tmp1,
10533                                       Register n_tmp2, Register n_tmp3) {
10534   if (is_pclmulqdq_supported) {
10535     movdl(w_xtmp1, in_out);
10536 
10537     movl(tmp1, const_or_pre_comp_const_index);
10538     movdl(w_xtmp2, tmp1);
10539     pclmulqdq(w_xtmp1, w_xtmp2, 0);
10540     // Keep result in XMM since GPR is 32 bit in length
10541   } else {
10542     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
10543   }
10544 }
10545 
10546 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10547                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10548                                      Register tmp1, Register tmp2,
10549                                      Register n_tmp3) {
10550   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10551   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10552 
10553   psllq(w_xtmp1, 1);
10554   movdl(tmp1, w_xtmp1);
10555   psrlq(w_xtmp1, 32);
10556   movdl(in_out, w_xtmp1);
10557 
10558   xorl(tmp2, tmp2);
10559   crc32(tmp2, tmp1, 4);
10560   xorl(in_out, tmp2);
10561 
10562   psllq(w_xtmp2, 1);
10563   movdl(tmp1, w_xtmp2);
10564   psrlq(w_xtmp2, 32);
10565   movdl(in1, w_xtmp2);
10566 
10567   xorl(tmp2, tmp2);
10568   crc32(tmp2, tmp1, 4);
10569   xorl(in1, tmp2);
10570   xorl(in_out, in1);
10571   xorl(in_out, in2);
10572 }
10573 
10574 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10575                                        Register in_out1, Register in_out2, Register in_out3,
10576                                        Register tmp1, Register tmp2, Register tmp3,
10577                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10578                                        Register tmp4, Register tmp5,
10579                                        Register n_tmp6) {
10580   Label L_processPartitions;
10581   Label L_processPartition;
10582   Label L_exit;
10583 
10584   bind(L_processPartitions);
10585   cmpl(in_out1, 3 * size);
10586   jcc(Assembler::less, L_exit);
10587     xorl(tmp1, tmp1);
10588     xorl(tmp2, tmp2);
10589     movl(tmp3, in_out2);
10590     addl(tmp3, size);
10591 
10592     bind(L_processPartition);
10593       crc32(in_out3, Address(in_out2, 0), 4);
10594       crc32(tmp1, Address(in_out2, size), 4);
10595       crc32(tmp2, Address(in_out2, size*2), 4);
10596       crc32(in_out3, Address(in_out2, 0+4), 4);
10597       crc32(tmp1, Address(in_out2, size+4), 4);
10598       crc32(tmp2, Address(in_out2, size*2+4), 4);
10599       addl(in_out2, 8);
10600       cmpl(in_out2, tmp3);
10601       jcc(Assembler::less, L_processPartition);
10602 
10603         push(tmp3);
10604         push(in_out1);
10605         push(in_out2);
10606         tmp4 = tmp3;
10607         tmp5 = in_out1;
10608         n_tmp6 = in_out2;
10609 
10610       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10611             w_xtmp1, w_xtmp2, w_xtmp3,
10612             tmp4, tmp5,
10613             n_tmp6);
10614 
10615         pop(in_out2);
10616         pop(in_out1);
10617         pop(tmp3);
10618 
10619     addl(in_out2, 2 * size);
10620     subl(in_out1, 3 * size);
10621     jmp(L_processPartitions);
10622 
10623   bind(L_exit);
10624 }
10625 #endif //LP64
10626 
10627 #ifdef _LP64
10628 // Algorithm 2: Pipelined usage of the CRC32 instruction.
10629 // Input: A buffer I of L bytes.
10630 // Output: the CRC32C value of the buffer.
10631 // Notations:
10632 // Write L = 24N + r, with N = floor (L/24).
10633 // r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
// N quadwords, and R consists of r bytes.
10636 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
10637 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
10638 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
10639 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
10640 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10641                                           Register tmp1, Register tmp2, Register tmp3,
10642                                           Register tmp4, Register tmp5, Register tmp6,
10643                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10644                                           bool is_pclmulqdq_supported) {
10645   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10646   Label L_wordByWord;
10647   Label L_byteByByteProlog;
10648   Label L_byteByByte;
10649   Label L_exit;
10650 
  if (is_pclmulqdq_supported) {
10652     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10653     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
10654 
10655     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10656     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10657 
10658     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10659     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10660     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
10661   } else {
10662     const_or_pre_comp_const_index[0] = 1;
10663     const_or_pre_comp_const_index[1] = 0;
10664 
10665     const_or_pre_comp_const_index[2] = 3;
10666     const_or_pre_comp_const_index[3] = 2;
10667 
10668     const_or_pre_comp_const_index[4] = 5;
10669     const_or_pre_comp_const_index[5] = 4;
10670    }
10671   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10672                     in2, in1, in_out,
10673                     tmp1, tmp2, tmp3,
10674                     w_xtmp1, w_xtmp2, w_xtmp3,
10675                     tmp4, tmp5,
10676                     tmp6);
10677   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10678                     in2, in1, in_out,
10679                     tmp1, tmp2, tmp3,
10680                     w_xtmp1, w_xtmp2, w_xtmp3,
10681                     tmp4, tmp5,
10682                     tmp6);
10683   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10684                     in2, in1, in_out,
10685                     tmp1, tmp2, tmp3,
10686                     w_xtmp1, w_xtmp2, w_xtmp3,
10687                     tmp4, tmp5,
10688                     tmp6);
10689   movl(tmp1, in2);
10690   andl(tmp1, 0x00000007);
10691   negl(tmp1);
10692   addl(tmp1, in2);
10693   addq(tmp1, in1);
10694 
10695   BIND(L_wordByWord);
10696   cmpq(in1, tmp1);
10697   jcc(Assembler::greaterEqual, L_byteByByteProlog);
10698     crc32(in_out, Address(in1, 0), 4);
10699     addq(in1, 4);
10700     jmp(L_wordByWord);
10701 
10702   BIND(L_byteByByteProlog);
10703   andl(in2, 0x00000007);
10704   movl(tmp2, 1);
10705 
10706   BIND(L_byteByByte);
10707   cmpl(tmp2, in2);
10708   jccb(Assembler::greater, L_exit);
10709     crc32(in_out, Address(in1, 0), 1);
10710     incq(in1);
10711     incl(tmp2);
10712     jmp(L_byteByByte);
10713 
10714   BIND(L_exit);
10715 }
10716 #else
10717 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10718                                           Register tmp1, Register  tmp2, Register tmp3,
10719                                           Register tmp4, Register  tmp5, Register tmp6,
10720                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10721                                           bool is_pclmulqdq_supported) {
10722   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10723   Label L_wordByWord;
10724   Label L_byteByByteProlog;
10725   Label L_byteByByte;
10726   Label L_exit;
10727 
10728   if (is_pclmulqdq_supported) {
10729     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10730     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
10731 
10732     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10733     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10734 
10735     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10736     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10737   } else {
10738     const_or_pre_comp_const_index[0] = 1;
10739     const_or_pre_comp_const_index[1] = 0;
10740 
10741     const_or_pre_comp_const_index[2] = 3;
10742     const_or_pre_comp_const_index[3] = 2;
10743 
10744     const_or_pre_comp_const_index[4] = 5;
10745     const_or_pre_comp_const_index[5] = 4;
10746   }
10747   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10748                     in2, in1, in_out,
10749                     tmp1, tmp2, tmp3,
10750                     w_xtmp1, w_xtmp2, w_xtmp3,
10751                     tmp4, tmp5,
10752                     tmp6);
10753   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10754                     in2, in1, in_out,
10755                     tmp1, tmp2, tmp3,
10756                     w_xtmp1, w_xtmp2, w_xtmp3,
10757                     tmp4, tmp5,
10758                     tmp6);
10759   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10760                     in2, in1, in_out,
10761                     tmp1, tmp2, tmp3,
10762                     w_xtmp1, w_xtmp2, w_xtmp3,
10763                     tmp4, tmp5,
10764                     tmp6);
10765   movl(tmp1, in2);
10766   andl(tmp1, 0x00000007);
10767   negl(tmp1);
10768   addl(tmp1, in2);
10769   addl(tmp1, in1);
10770 
10771   BIND(L_wordByWord);
10772   cmpl(in1, tmp1);
10773   jcc(Assembler::greaterEqual, L_byteByByteProlog);
10774     crc32(in_out, Address(in1,0), 4);
10775     addl(in1, 4);
10776     jmp(L_wordByWord);
10777 
10778   BIND(L_byteByByteProlog);
10779   andl(in2, 0x00000007);
10780   movl(tmp2, 1);
10781 
10782   BIND(L_byteByByte);
10783   cmpl(tmp2, in2);
10784   jccb(Assembler::greater, L_exit);
10785     movb(tmp1, Address(in1, 0));
10786     crc32(in_out, tmp1, 1);
10787     incl(in1);
10788     incl(tmp2);
10789     jmp(L_byteByByte);
10790 
10791   BIND(L_exit);
10792 }
10793 #endif // LP64
10794 #undef BIND
10795 #undef BLOCK_COMMENT
10796 
10797 // Compress char[] array to byte[].
10798 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
10799 //   @HotSpotIntrinsicCandidate
10800 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
10801 //     for (int i = 0; i < len; i++) {
10802 //       int c = src[srcOff++];
10803 //       if (c >>> 8 != 0) {
10804 //         return 0;
10805 //       }
10806 //       dst[dstOff++] = (byte)c;
10807 //     }
10808 //     return len;
10809 //   }
10810 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10811   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10812   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10813   Register tmp5, Register result) {
10814   Label copy_chars_loop, return_length, return_zero, done, below_threshold;
10815 
10816   // rsi: src
10817   // rdi: dst
10818   // rdx: len
10819   // rcx: tmp5
10820   // rax: result
10821 
10822   // rsi holds start addr of source char[] to be compressed
10823   // rdi holds start addr of destination byte[]
10824   // rdx holds length
10825 
10826   assert(len != result, "");
10827 
10828   // save length for return
10829   push(len);
10830 
10831   if ((UseAVX > 2) && // AVX512
10832     VM_Version::supports_avx512vlbw() &&
10833     VM_Version::supports_bmi2()) {
10834 
10835     set_vector_masking();  // opening of the stub context for programming mask registers
10836 
10837     Label copy_32_loop, copy_loop_tail, restore_k1_return_zero;
10838 
    // alignment
10840     Label post_alignement;
10841 
    // If the length of the string is less than 32, handle it the
    // old-fashioned way.
10844     testl(len, -32);
10845     jcc(Assembler::zero, below_threshold);
10846 
    // First check whether a character is compressible (<= 0xFF).
10848     // Create mask to test for Unicode chars inside zmm vector
10849     movl(result, 0x00FF);
10850     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
10851 
10852     // Save k1
10853     kmovql(k3, k1);
10854 
10855     testl(len, -64);
10856     jcc(Assembler::zero, post_alignement);
10857 
10858     movl(tmp5, dst);
10859     andl(tmp5, (32 - 1));
10860     negl(tmp5);
10861     andl(tmp5, (32 - 1));
10862 
10863     // bail out when there is nothing to be done
10864     testl(tmp5, 0xFFFFFFFF);
10865     jcc(Assembler::zero, post_alignement);
10866 
10867     // ~(~0 << len), where len is the # of remaining elements to process
10868     movl(result, 0xFFFFFFFF);
10869     shlxl(result, result, tmp5);
10870     notl(result);
10871     kmovdl(k1, result);
10872 
10873     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10874     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10875     ktestd(k2, k1);
10876     jcc(Assembler::carryClear, restore_k1_return_zero);
10877 
10878     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10879 
10880     addptr(src, tmp5);
10881     addptr(src, tmp5);
10882     addptr(dst, tmp5);
10883     subl(len, tmp5);
10884 
10885     bind(post_alignement);
    // end of alignment
10887 
10888     movl(tmp5, len);
10889     andl(tmp5, (32 - 1));    // tail count (in chars)
10890     andl(len, ~(32 - 1));    // vector count (in chars)
10891     jcc(Assembler::zero, copy_loop_tail);
10892 
10893     lea(src, Address(src, len, Address::times_2));
10894     lea(dst, Address(dst, len, Address::times_1));
10895     negptr(len);
10896 
10897     bind(copy_32_loop);
10898     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
10899     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10900     kortestdl(k2, k2);
10901     jcc(Assembler::carryClear, restore_k1_return_zero);
10902 
    // All elements in the current chunk are valid candidates for
    // compression. Write the truncated byte elements to memory.
10905     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
10906     addptr(len, 32);
10907     jcc(Assembler::notZero, copy_32_loop);
10908 
10909     bind(copy_loop_tail);
10910     // bail out when there is nothing to be done
10911     testl(tmp5, 0xFFFFFFFF);
10912     // Restore k1
10913     kmovql(k1, k3);
10914     jcc(Assembler::zero, return_length);
10915 
10916     movl(len, tmp5);
10917 
10918     // ~(~0 << len), where len is the # of remaining elements to process
10919     movl(result, 0xFFFFFFFF);
10920     shlxl(result, result, len);
10921     notl(result);
10922 
10923     kmovdl(k1, result);
10924 
10925     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10926     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10927     ktestd(k2, k1);
10928     jcc(Assembler::carryClear, restore_k1_return_zero);
10929 
10930     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10931     // Restore k1
10932     kmovql(k1, k3);
10933     jmp(return_length);
10934 
10935     bind(restore_k1_return_zero);
10936     // Restore k1
10937     kmovql(k1, k3);
10938     jmp(return_zero);
10939 
10940     clear_vector_masking();   // closing of the stub context for programming mask registers
10941   }
10942   if (UseSSE42Intrinsics) {
10943     Label copy_32_loop, copy_16, copy_tail;
10944 
10945     bind(below_threshold);
10946 
10947     movl(result, len);
10948 
10949     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
10950 
10951     // vectored compression
10952     andl(len, 0xfffffff0);    // vector count (in chars)
10953     andl(result, 0x0000000f);    // tail count (in chars)
10954     testl(len, len);
10955     jccb(Assembler::zero, copy_16);
10956 
10957     // compress 16 chars per iter
10958     movdl(tmp1Reg, tmp5);
10959     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10960     pxor(tmp4Reg, tmp4Reg);
10961 
10962     lea(src, Address(src, len, Address::times_2));
10963     lea(dst, Address(dst, len, Address::times_1));
10964     negptr(len);
10965 
10966     bind(copy_32_loop);
10967     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10968     por(tmp4Reg, tmp2Reg);
10969     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
10970     por(tmp4Reg, tmp3Reg);
10971     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
10972     jcc(Assembler::notZero, return_zero);
10973     packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
10974     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
10975     addptr(len, 16);
10976     jcc(Assembler::notZero, copy_32_loop);
10977 
10978     // compress next vector of 8 chars (if any)
10979     bind(copy_16);
10980     movl(len, result);
10981     andl(len, 0xfffffff8);    // vector count (in chars)
10982     andl(result, 0x00000007);    // tail count (in chars)
10983     testl(len, len);
10984     jccb(Assembler::zero, copy_tail);
10985 
10986     movdl(tmp1Reg, tmp5);
10987     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10988     pxor(tmp3Reg, tmp3Reg);
10989 
10990     movdqu(tmp2Reg, Address(src, 0));
10991     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
10992     jccb(Assembler::notZero, return_zero);
10993     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10994     movq(Address(dst, 0), tmp2Reg);
10995     addptr(src, 16);
10996     addptr(dst, 8);
10997 
10998     bind(copy_tail);
10999     movl(len, result);
11000   }
11001   // compress 1 char per iter
11002   testl(len, len);
11003   jccb(Assembler::zero, return_length);
11004   lea(src, Address(src, len, Address::times_2));
11005   lea(dst, Address(dst, len, Address::times_1));
11006   negptr(len);
11007 
11008   bind(copy_chars_loop);
11009   load_unsigned_short(result, Address(src, len, Address::times_2));
11010   testl(result, 0xff00);      // check if Unicode char
11011   jccb(Assembler::notZero, return_zero);
11012   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
11013   increment(len);
11014   jcc(Assembler::notZero, copy_chars_loop);
11015 
11016   // if compression succeeded, return length
11017   bind(return_length);
11018   pop(result);
11019   jmpb(done);
11020 
11021   // if compression failed, return 0
11022   bind(return_zero);
11023   xorl(result, result);
11024   addptr(rsp, wordSize);
11025 
11026   bind(done);
11027 }
11028 
11029 // Inflate byte[] array to char[].
11030 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
11031 //   @HotSpotIntrinsicCandidate
11032 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
11033 //     for (int i = 0; i < len; i++) {
11034 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
11035 //     }
11036 //   }
11037 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
11038   XMMRegister tmp1, Register tmp2) {
11039   Label copy_chars_loop, done, below_threshold;
11040   // rsi: src
11041   // rdi: dst
11042   // rdx: len
11043   // rcx: tmp2
11044 
11045   // rsi holds start addr of source byte[] to be inflated
11046   // rdi holds start addr of destination char[]
11047   // rdx holds length
11048   assert_different_registers(src, dst, len, tmp2);
11049 
11050   if ((UseAVX > 2) && // AVX512
11051     VM_Version::supports_avx512vlbw() &&
11052     VM_Version::supports_bmi2()) {
11053 
11054     set_vector_masking();  // opening of the stub context for programming mask registers
11055 
11056     Label copy_32_loop, copy_tail;
11057     Register tmp3_aliased = len;
11058 
    // If the length of the string is less than 16, handle it the
    // old-fashioned way.
11061     testl(len, -16);
11062     jcc(Assembler::zero, below_threshold);
11063 
    // In order to use only one arithmetic operation for the main loop, we use
    // this pre-calculation.
11066     movl(tmp2, len);
11067     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
11068     andl(len, -32);     // vector count
11069     jccb(Assembler::zero, copy_tail);
11070 
11071     lea(src, Address(src, len, Address::times_1));
11072     lea(dst, Address(dst, len, Address::times_2));
11073     negptr(len);
11074 
11075 
11076     // inflate 32 chars per iter
11077     bind(copy_32_loop);
11078     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
11079     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
11080     addptr(len, 32);
11081     jcc(Assembler::notZero, copy_32_loop);
11082 
11083     bind(copy_tail);
11084     // bail out when there is nothing to be done
11085     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
11086     jcc(Assembler::zero, done);
11087 
11088     // Save k1
11089     kmovql(k2, k1);
11090 
11091     // ~(~0 << length), where length is the # of remaining elements to process
11092     movl(tmp3_aliased, -1);
11093     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
11094     notl(tmp3_aliased);
11095     kmovdl(k1, tmp3_aliased);
11096     evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
11097     evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
11098 
11099     // Restore k1
11100     kmovql(k1, k2);
11101     jmp(done);
11102 
11103     clear_vector_masking();   // closing of the stub context for programming mask registers
11104   }
11105   if (UseSSE42Intrinsics) {
11106     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
11107 
11108     movl(tmp2, len);
11109 
11110     if (UseAVX > 1) {
11111       andl(tmp2, (16 - 1));
11112       andl(len, -16);
11113       jccb(Assembler::zero, copy_new_tail);
11114     } else {
11115       andl(tmp2, 0x00000007);   // tail count (in chars)
11116       andl(len, 0xfffffff8);    // vector count (in chars)
11117       jccb(Assembler::zero, copy_tail);
11118     }
11119 
11120     // vectored inflation
11121     lea(src, Address(src, len, Address::times_1));
11122     lea(dst, Address(dst, len, Address::times_2));
11123     negptr(len);
11124 
11125     if (UseAVX > 1) {
11126       bind(copy_16_loop);
11127       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
11128       vmovdqu(Address(dst, len, Address::times_2), tmp1);
11129       addptr(len, 16);
11130       jcc(Assembler::notZero, copy_16_loop);
11131 
11132       bind(below_threshold);
11133       bind(copy_new_tail);
11134       if ((UseAVX > 2) &&
11135         VM_Version::supports_avx512vlbw() &&
11136         VM_Version::supports_bmi2()) {
11137         movl(tmp2, len);
11138       } else {
11139         movl(len, tmp2);
11140       }
11141       andl(tmp2, 0x00000007);
11142       andl(len, 0xFFFFFFF8);
11143       jccb(Assembler::zero, copy_tail);
11144 
11145       pmovzxbw(tmp1, Address(src, 0));
11146       movdqu(Address(dst, 0), tmp1);
11147       addptr(src, 8);
11148       addptr(dst, 2 * 8);
11149 
11150       jmp(copy_tail, true);
11151     }
11152 
11153     // inflate 8 chars per iter
11154     bind(copy_8_loop);
11155     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
11156     movdqu(Address(dst, len, Address::times_2), tmp1);
11157     addptr(len, 8);
11158     jcc(Assembler::notZero, copy_8_loop);
11159 
11160     bind(copy_tail);
11161     movl(len, tmp2);
11162 
11163     cmpl(len, 4);
11164     jccb(Assembler::less, copy_bytes);
11165 
11166     movdl(tmp1, Address(src, 0));  // load 4 byte chars
11167     pmovzxbw(tmp1, tmp1);
11168     movq(Address(dst, 0), tmp1);
11169     subptr(len, 4);
11170     addptr(src, 4);
11171     addptr(dst, 8);
11172 
11173     bind(copy_bytes);
11174   }
11175   testl(len, len);
11176   jccb(Assembler::zero, done);
11177   lea(src, Address(src, len, Address::times_1));
11178   lea(dst, Address(dst, len, Address::times_2));
11179   negptr(len);
11180 
11181   // inflate 1 char per iter
11182   bind(copy_chars_loop);
11183   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
11184   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
11185   increment(len);
11186   jcc(Assembler::notZero, copy_chars_loop);
11187 
11188   bind(done);
11189 }
11190 
11191 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
11192   switch (cond) {
11193     // Note some conditions are synonyms for others
11194     case Assembler::zero:         return Assembler::notZero;
11195     case Assembler::notZero:      return Assembler::zero;
11196     case Assembler::less:         return Assembler::greaterEqual;
11197     case Assembler::lessEqual:    return Assembler::greater;
11198     case Assembler::greater:      return Assembler::lessEqual;
11199     case Assembler::greaterEqual: return Assembler::less;
11200     case Assembler::below:        return Assembler::aboveEqual;
11201     case Assembler::belowEqual:   return Assembler::above;
11202     case Assembler::above:        return Assembler::belowEqual;
11203     case Assembler::aboveEqual:   return Assembler::below;
11204     case Assembler::overflow:     return Assembler::noOverflow;
11205     case Assembler::noOverflow:   return Assembler::overflow;
11206     case Assembler::negative:     return Assembler::positive;
11207     case Assembler::positive:     return Assembler::negative;
11208     case Assembler::parity:       return Assembler::noParity;
11209     case Assembler::noParity:     return Assembler::parity;
11210   }
11211   ShouldNotReachHere(); return Assembler::overflow;
11212 }
11213 
11214 SkipIfEqual::SkipIfEqual(
11215     MacroAssembler* masm, const bool* flag_addr, bool value) {
11216   _masm = masm;
11217   _masm->cmp8(ExternalAddress((address)flag_addr), value);
11218   _masm->jcc(Assembler::equal, _label);
11219 }
11220 
11221 SkipIfEqual::~SkipIfEqual() {
11222   _masm->bind(_label);
11223 }
11224 
11225 // 32-bit Windows has its own fast-path implementation
11226 // of get_thread
11227 #if !defined(WIN32) || defined(_LP64)
11228 
11229 // This is simply a call to Thread::current()
11230 void MacroAssembler::get_thread(Register thread) {
11231   if (thread != rax) {
11232     push(rax);
11233   }
11234   LP64_ONLY(push(rdi);)
11235   LP64_ONLY(push(rsi);)
11236   push(rdx);
11237   push(rcx);
11238 #ifdef _LP64
11239   push(r8);
11240   push(r9);
11241   push(r10);
11242   push(r11);
11243 #endif
11244 
11245   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
11246 
11247 #ifdef _LP64
11248   pop(r11);
11249   pop(r10);
11250   pop(r9);
11251   pop(r8);
11252 #endif
11253   pop(rcx);
11254   pop(rdx);
11255   LP64_ONLY(pop(rsi);)
11256   LP64_ONLY(pop(rdi);)
11257   if (thread != rax) {
11258     mov(thread, rax);
11259     pop(rax);
11260   }
11261 }
11262 
11263 #endif