1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/collectedHeap.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "memory/universe.hpp"
  36 #include "oops/accessDecorators.hpp"
  37 #include "oops/klass.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/biasedLocking.hpp"
  40 #include "runtime/flags/flagSetting.hpp"
  41 #include "runtime/interfaceSupport.inline.hpp"
  42 #include "runtime/objectMonitor.hpp"
  43 #include "runtime/os.hpp"
  44 #include "runtime/safepoint.hpp"
  45 #include "runtime/safepointMechanism.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.hpp"
  49 #include "utilities/macros.hpp"
  50 #include "crc32c.h"
  51 #ifdef COMPILER2
  52 #include "opto/intrinsicnode.hpp"
  53 #endif
  54 
  55 #ifdef PRODUCT
  56 #define BLOCK_COMMENT(str) /* nothing */
  57 #define STOP(error) stop(error)
  58 #else
  59 #define BLOCK_COMMENT(str) block_comment(str)
  60 #define STOP(error) block_comment(error); stop(error)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 
  65 #ifdef ASSERT
  66 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  67 #endif
  68 
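     // Table mapping each condition code (the array index) to its logical negation;
     // used, e.g., to invert a branch condition without a switch.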
  69 static Assembler::Condition reverse[] = {
  70     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  71     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  72     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  73     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  74     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  75     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  76     Assembler::above          /* belowEqual    = 0x6 */ ,
  77     Assembler::belowEqual     /* above         = 0x7 */ ,
  78     Assembler::positive       /* negative      = 0x8 */ ,
  79     Assembler::negative       /* positive      = 0x9 */ ,
  80     Assembler::noParity       /* parity        = 0xa */ ,
  81     Assembler::parity         /* noParity      = 0xb */ ,
  82     Assembler::greaterEqual   /* less          = 0xc */ ,
  83     Assembler::less           /* greaterEqual  = 0xd */ ,
  84     Assembler::greater        /* lessEqual     = 0xe */ ,
  85     Assembler::lessEqual      /* greater       = 0xf */
  86 
  87 };
  88 
  89 
  90 // Implementation of MacroAssembler
  91 
  92 // First, all the versions that differ between 32-bit and 64-bit,
  93 // unless the difference is trivial (1 line or so).
  94 
  95 #ifndef _LP64
  96 
  97 // 32bit versions
  98 
  99 Address MacroAssembler::as_Address(AddressLiteral adr) {
 100   return Address(adr.target(), adr.rspec());
 101 }
 102 
 103 Address MacroAssembler::as_Address(ArrayAddress adr) {
 104   return Address::make_array(adr);
 105 }
 106 
 107 void MacroAssembler::call_VM_leaf_base(address entry_point,
 108                                        int number_of_arguments) {
 109   call(RuntimeAddress(entry_point));
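       // the arguments were pushed by the caller (see pass_argN below);
       // pop them off now that the call has returned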
 110   increment(rsp, number_of_arguments * wordSize);
 111 }
 112 
 113 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 114   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 115 }
 116 
 117 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 118   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 119 }
 120 
 121 void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
 122   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 123 }
 124 
 125 void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
 126   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 127 }
 128 
 129 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 130   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 131   bs->obj_equals(this, src1, obj);
 132 }
 133 
 134 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 135   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 136   bs->obj_equals(this, src1, obj);
 137 }
 138 
 139 void MacroAssembler::extend_sign(Register hi, Register lo) {
 140   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 141   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 142     cdql();
 143   } else {
 144     movl(hi, lo);
 145     sarl(hi, 31);
 146   }
 147 }
 148 
 149 void MacroAssembler::jC2(Register tmp, Label& L) {
 150   // set parity bit if FPU flag C2 is set (via rax)
 151   save_rax(tmp);
 152   fwait(); fnstsw_ax();
 153   sahf();
 154   restore_rax(tmp);
 155   // branch
 156   jcc(Assembler::parity, L);
 157 }
 158 
 159 void MacroAssembler::jnC2(Register tmp, Label& L) {
 160   // set parity bit if FPU flag C2 is set (via rax)
 161   save_rax(tmp);
 162   fwait(); fnstsw_ax();
 163   sahf();
 164   restore_rax(tmp);
 165   // branch
 166   jcc(Assembler::noParity, L);
 167 }
 168 
 169 // 32bit can do a case table jump in one instruction, but we no longer allow the base
 170 // to be installed in the Address class.
 171 void MacroAssembler::jump(ArrayAddress entry) {
 172   jmp(as_Address(entry));
 173 }
 174 
 175 // Note: y_lo will be destroyed
 176 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 177   // Long compare for Java (semantics as described in JVM spec.)
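       // Result is returned in x_hi: -1 if x < y, 0 if x == y, +1 if x > y.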
 178   Label high, low, done;
 179 
 180   cmpl(x_hi, y_hi);
 181   jcc(Assembler::less, low);
 182   jcc(Assembler::greater, high);
 183   // x_hi is the return register
 184   xorl(x_hi, x_hi);
 185   cmpl(x_lo, y_lo);
 186   jcc(Assembler::below, low);
 187   jcc(Assembler::equal, done);
 188 
 189   bind(high);
 190   xorl(x_hi, x_hi);
 191   increment(x_hi);
 192   jmp(done);
 193 
 194   bind(low);
 195   xorl(x_hi, x_hi);
 196   decrementl(x_hi);
 197 
 198   bind(done);
 199 }
 200 
 201 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 202     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 203 }
 204 
 205 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 206   // leal(dst, as_Address(adr));
 207   // see note in movl as to why we must use a move
 208   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 209 }
 210 
 211 void MacroAssembler::leave() {
 212   mov(rsp, rbp);
 213   pop(rbp);
 214 }
 215 
 216 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 217   // Multiplication of two Java long values stored on the stack
 218   // as illustrated below. Result is in rdx:rax.
 219   //
 220   // rsp ---> [  ??  ] \               \
 221   //            ....    | y_rsp_offset  |
 222   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 223   //          [ y_hi ]                  | (in bytes)
 224   //            ....                    |
 225   //          [ x_lo ]                 /
 226   //          [ x_hi ]
 227   //            ....
 228   //
 229   // Basic idea: lo(result) = lo(x_lo * y_lo)
 230   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 231   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 232   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 233   Label quick;
 234   // load x_hi, y_hi and check if quick
 235   // multiplication is possible
 236   movl(rbx, x_hi);
 237   movl(rcx, y_hi);
 238   movl(rax, rbx);
 239   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 240   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 241   // do full multiplication
 242   // 1st step
 243   mull(y_lo);                                    // x_hi * y_lo
 244   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
 245   // 2nd step
 246   movl(rax, x_lo);
 247   mull(rcx);                                     // x_lo * y_hi
 248   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
 249   // 3rd step
 250   bind(quick);                                   // note: rbx = 0 if quick multiply!
 251   movl(rax, x_lo);
 252   mull(y_lo);                                    // x_lo * y_lo
 253   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 254 }
 255 
 256 void MacroAssembler::lneg(Register hi, Register lo) {
 257   negl(lo);
 258   adcl(hi, 0);
 259   negl(hi);
 260 }
 261 
 262 void MacroAssembler::lshl(Register hi, Register lo) {
 263   // Java shift left long support (semantics as described in JVM spec., p.305)
 264   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 265   // shift value is in rcx !
 266   assert(hi != rcx, "must not use rcx");
 267   assert(lo != rcx, "must not use rcx");
 268   const Register s = rcx;                        // shift count
 269   const int      n = BitsPerWord;
 270   Label L;
 271   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 272   cmpl(s, n);                                    // if (s < n)
 273   jcc(Assembler::less, L);                       // else (s >= n)
 274   movl(hi, lo);                                  // x := x << n
 275   xorl(lo, lo);
 276   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 277   bind(L);                                       // s (mod n) < n
 278   shldl(hi, lo);                                 // x := x << s
 279   shll(lo);
 280 }
 281 
 282 
 283 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 284   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 285   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 286   assert(hi != rcx, "must not use rcx");
 287   assert(lo != rcx, "must not use rcx");
 288   const Register s = rcx;                        // shift count
 289   const int      n = BitsPerWord;
 290   Label L;
 291   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 292   cmpl(s, n);                                    // if (s < n)
 293   jcc(Assembler::less, L);                       // else (s >= n)
 294   movl(lo, hi);                                  // x := x >> n
 295   if (sign_extension) sarl(hi, 31);
 296   else                xorl(hi, hi);
 297   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 298   bind(L);                                       // s (mod n) < n
 299   shrdl(lo, hi);                                 // x := x >> s
 300   if (sign_extension) sarl(hi);
 301   else                shrl(hi);
 302 }
 303 
 304 void MacroAssembler::movoop(Register dst, jobject obj) {
 305   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 306 }
 307 
 308 void MacroAssembler::movoop(Address dst, jobject obj) {
 309   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 310 }
 311 
 312 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 313   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 314 }
 315 
 316 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 317   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 318 }
 319 
 320 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 321   // scratch register is not used,
 322   // it is defined to match parameters of 64-bit version of this method.
 323   if (src.is_lval()) {
 324     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 325   } else {
 326     movl(dst, as_Address(src));
 327   }
 328 }
 329 
 330 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 331   movl(as_Address(dst), src);
 332 }
 333 
 334 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 335   movl(dst, as_Address(src));
 336 }
 337 
 338 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 339 void MacroAssembler::movptr(Address dst, intptr_t src) {
 340   movl(dst, src);
 341 }
 342 
 343 
 344 void MacroAssembler::pop_callee_saved_registers() {
 345   pop(rcx);
 346   pop(rdx);
 347   pop(rdi);
 348   pop(rsi);
 349 }
 350 
 351 void MacroAssembler::pop_fTOS() {
 352   fld_d(Address(rsp, 0));
 353   addl(rsp, 2 * wordSize);
 354 }
 355 
 356 void MacroAssembler::push_callee_saved_registers() {
 357   push(rsi);
 358   push(rdi);
 359   push(rdx);
 360   push(rcx);
 361 }
 362 
 363 void MacroAssembler::push_fTOS() {
 364   subl(rsp, 2 * wordSize);
 365   fstp_d(Address(rsp, 0));
 366 }
 367 
 368 
 369 void MacroAssembler::pushoop(jobject obj) {
 370   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 371 }
 372 
 373 void MacroAssembler::pushklass(Metadata* obj) {
 374   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 375 }
 376 
 377 void MacroAssembler::pushptr(AddressLiteral src) {
 378   if (src.is_lval()) {
 379     push_literal32((int32_t)src.target(), src.rspec());
 380   } else {
 381     pushl(as_Address(src));
 382   }
 383 }
 384 
 385 void MacroAssembler::set_word_if_not_zero(Register dst) {
 386   xorl(dst, dst);
 387   set_byte_if_not_zero(dst);
 388 }
 389 
 390 static void pass_arg0(MacroAssembler* masm, Register arg) {
 391   masm->push(arg);
 392 }
 393 
 394 static void pass_arg1(MacroAssembler* masm, Register arg) {
 395   masm->push(arg);
 396 }
 397 
 398 static void pass_arg2(MacroAssembler* masm, Register arg) {
 399   masm->push(arg);
 400 }
 401 
 402 static void pass_arg3(MacroAssembler* masm, Register arg) {
 403   masm->push(arg);
 404 }
 405 
 406 #ifndef PRODUCT
 407 extern "C" void findpc(intptr_t x);
 408 #endif
 409 
 410 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 411   // In order to get locks to work, we need to fake an in_VM state
 412   JavaThread* thread = JavaThread::current();
 413   JavaThreadState saved_state = thread->thread_state();
 414   thread->set_thread_state(_thread_in_vm);
 415   if (ShowMessageBoxOnError) {
 416     JavaThread* thread = JavaThread::current();
 417     JavaThreadState saved_state = thread->thread_state();
 418     thread->set_thread_state(_thread_in_vm);
 419     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 420       ttyLocker ttyl;
 421       BytecodeCounter::print();
 422     }
 423     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 424     // This is the value of eip which points to where verify_oop will return.
 425     if (os::message_box(msg, "Execution stopped, print registers?")) {
 426       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 427       BREAKPOINT;
 428     }
 429   } else {
 430     ttyLocker ttyl;
 431     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
 432   }
 433   // Don't assert holding the ttyLock
 434   assert(false, "DEBUG MESSAGE: %s", msg);
 435   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 436 }
 437 
 438 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 439   ttyLocker ttyl;
 440   FlagSetting fs(Debugging, true);
 441   tty->print_cr("eip = 0x%08x", eip);
 442 #ifndef PRODUCT
 443   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 444     tty->cr();
 445     findpc(eip);
 446     tty->cr();
 447   }
 448 #endif
 449 #define PRINT_REG(rax) \
 450   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 451   PRINT_REG(rax);
 452   PRINT_REG(rbx);
 453   PRINT_REG(rcx);
 454   PRINT_REG(rdx);
 455   PRINT_REG(rdi);
 456   PRINT_REG(rsi);
 457   PRINT_REG(rbp);
 458   PRINT_REG(rsp);
 459 #undef PRINT_REG
 460   // Print some words near the top of the stack.
 461   int* dump_sp = (int*) rsp;
 462   for (int col1 = 0; col1 < 8; col1++) {
 463     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 464     os::print_location(tty, *dump_sp++);
 465   }
 466   for (int row = 0; row < 16; row++) {
 467     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 468     for (int col = 0; col < 8; col++) {
 469       tty->print(" 0x%08x", *dump_sp++);
 470     }
 471     tty->cr();
 472   }
 473   // Print some instructions around pc:
 474   Disassembler::decode((address)eip-64, (address)eip);
 475   tty->print_cr("--------");
 476   Disassembler::decode((address)eip, (address)eip+32);
 477 }
 478 
 479 void MacroAssembler::stop(const char* msg) {
 480   ExternalAddress message((address)msg);
 481   // push address of message
 482   pushptr(message.addr());
 483   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 484   pusha();                                            // push registers
 485   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 486   hlt();
 487 }
 488 
 489 void MacroAssembler::warn(const char* msg) {
 490   push_CPU_state();
 491 
 492   ExternalAddress message((address) msg);
 493   // push address of message
 494   pushptr(message.addr());
 495 
 496   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 497   addl(rsp, wordSize);       // discard argument
 498   pop_CPU_state();
 499 }
 500 
 501 void MacroAssembler::print_state() {
 502   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 503   pusha();                                            // push registers
 504 
 505   push_CPU_state();
 506   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 507   pop_CPU_state();
 508 
 509   popa();
 510   addl(rsp, wordSize);
 511 }
 512 
 513 #else // _LP64
 514 
 515 // 64 bit versions
 516 
 517 Address MacroAssembler::as_Address(AddressLiteral adr) {
 518   // amd64 always does this as a pc-rel
 519   // we can be absolute or disp based on the instruction type
 520   // jmp/call are displacements others are absolute
 521   assert(!adr.is_lval(), "must be rval");
 522   assert(reachable(adr), "must be");
 523   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 524 
 525 }
 526 
 527 Address MacroAssembler::as_Address(ArrayAddress adr) {
 528   AddressLiteral base = adr.base();
 529   lea(rscratch1, base);
 530   Address index = adr.index();
 531   assert(index._disp == 0, "must not have disp"); // maybe it can?
 532   Address array(rscratch1, index._index, index._scale, index._disp);
 533   return array;
 534 }
 535 
 536 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 537   Label L, E;
 538 
 539 #ifdef _WIN64
 540   // Windows always allocates space for its register args
 541   assert(num_args <= 4, "only register arguments supported");
 542   subq(rsp,  frame::arg_reg_save_area_bytes);
 543 #endif
 544 
 545   // Align stack if necessary
 546   testl(rsp, 15);
 547   jcc(Assembler::zero, L);
 548 
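       // rsp is assumed to be 8-byte aligned here, so pushing 8 filler bytes restores
       // the 16-byte alignment required at the call; it is popped again afterwards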
 549   subq(rsp, 8);
 550   {
 551     call(RuntimeAddress(entry_point));
 552   }
 553   addq(rsp, 8);
 554   jmp(E);
 555 
 556   bind(L);
 557   {
 558     call(RuntimeAddress(entry_point));
 559   }
 560 
 561   bind(E);
 562 
 563 #ifdef _WIN64
 564   // restore stack pointer
 565   addq(rsp, frame::arg_reg_save_area_bytes);
 566 #endif
 567 
 568 }
 569 
 570 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 571   assert(!src2.is_lval(), "should use cmpptr");
 572 
 573   if (reachable(src2)) {
 574     cmpq(src1, as_Address(src2));
 575   } else {
 576     lea(rscratch1, src2);
 577     Assembler::cmpq(src1, Address(rscratch1, 0));
 578   }
 579 }
 580 
 581 int MacroAssembler::corrected_idivq(Register reg) {
 582   // Full implementation of Java ldiv and lrem; checks for special
 583   // case as described in JVM spec., p.243 & p.271.  The function
 584   // returns the (pc) offset of the idivl instruction - may be needed
 585   // for implicit exceptions.
 586   //
 587   //         normal case                           special case
 588   //
 589   // input : rax: dividend                         min_long
 590   //         reg: divisor   (may not be eax/edx)   -1
 591   //
 592   // output: rax: quotient  (= rax idiv reg)       min_long
 593   //         rdx: remainder (= rax irem reg)       0
 594   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 595   static const int64_t min_long = 0x8000000000000000;
 596   Label normal_case, special_case;
 597 
 598   // check for special case
 599   cmp64(rax, ExternalAddress((address) &min_long));
 600   jcc(Assembler::notEqual, normal_case);
 601   xorl(rdx, rdx); // prepare rdx for possible special case (where
 602                   // remainder = 0)
 603   cmpq(reg, -1);
 604   jcc(Assembler::equal, special_case);
 605 
 606   // handle normal case
 607   bind(normal_case);
 608   cdqq();
 609   int idivq_offset = offset();
 610   idivq(reg);
 611 
 612   // normal and special case exit
 613   bind(special_case);
 614 
 615   return idivq_offset;
 616 }
 617 
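     // incrementq/decrementq add or subtract a small constant, preferring inc/dec when
     // UseIncDec allows it; min_jint is special-cased because negating it would overflow.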
 618 void MacroAssembler::decrementq(Register reg, int value) {
 619   if (value == min_jint) { subq(reg, value); return; }
 620   if (value <  0) { incrementq(reg, -value); return; }
 621   if (value == 0) {                        ; return; }
 622   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 623   /* else */      { subq(reg, value)       ; return; }
 624 }
 625 
 626 void MacroAssembler::decrementq(Address dst, int value) {
 627   if (value == min_jint) { subq(dst, value); return; }
 628   if (value <  0) { incrementq(dst, -value); return; }
 629   if (value == 0) {                        ; return; }
 630   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 631   /* else */      { subq(dst, value)       ; return; }
 632 }
 633 
 634 void MacroAssembler::incrementq(AddressLiteral dst) {
 635   if (reachable(dst)) {
 636     incrementq(as_Address(dst));
 637   } else {
 638     lea(rscratch1, dst);
 639     incrementq(Address(rscratch1, 0));
 640   }
 641 }
 642 
 643 void MacroAssembler::incrementq(Register reg, int value) {
 644   if (value == min_jint) { addq(reg, value); return; }
 645   if (value <  0) { decrementq(reg, -value); return; }
 646   if (value == 0) {                        ; return; }
 647   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 648   /* else */      { addq(reg, value)       ; return; }
 649 }
 650 
 651 void MacroAssembler::incrementq(Address dst, int value) {
 652   if (value == min_jint) { addq(dst, value); return; }
 653   if (value <  0) { decrementq(dst, -value); return; }
 654   if (value == 0) {                        ; return; }
 655   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 656   /* else */      { addq(dst, value)       ; return; }
 657 }
 658 
 659 // 32bit can do a case table jump in one instruction, but we no longer allow the base
 660 // to be installed in the Address class.
 661 void MacroAssembler::jump(ArrayAddress entry) {
 662   lea(rscratch1, entry.base());
 663   Address dispatch = entry.index();
 664   assert(dispatch._base == noreg, "must be");
 665   dispatch._base = rscratch1;
 666   jmp(dispatch);
 667 }
 668 
 669 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 670   ShouldNotReachHere(); // 64bit doesn't use two regs
 671   cmpq(x_lo, y_lo);
 672 }
 673 
 674 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 675     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 676 }
 677 
 678 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 679   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 680   movptr(dst, rscratch1);
 681 }
 682 
 683 void MacroAssembler::leave() {
 684   // %%% is this really better? Why not on 32bit too?
 685   emit_int8((unsigned char)0xC9); // LEAVE
 686 }
 687 
 688 void MacroAssembler::lneg(Register hi, Register lo) {
 689   ShouldNotReachHere(); // 64bit doesn't use two regs
 690   negq(lo);
 691 }
 692 
 693 void MacroAssembler::movoop(Register dst, jobject obj) {
 694   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 695 }
 696 
 697 void MacroAssembler::movoop(Address dst, jobject obj) {
 698   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 699   movq(dst, rscratch1);
 700 }
 701 
 702 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 703   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 704 }
 705 
 706 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 707   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 708   movq(dst, rscratch1);
 709 }
 710 
 711 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 712   if (src.is_lval()) {
 713     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 714   } else {
 715     if (reachable(src)) {
 716       movq(dst, as_Address(src));
 717     } else {
 718       lea(scratch, src);
 719       movq(dst, Address(scratch, 0));
 720     }
 721   }
 722 }
 723 
 724 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 725   movq(as_Address(dst), src);
 726 }
 727 
 728 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 729   movq(dst, as_Address(src));
 730 }
 731 
 732 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 733 void MacroAssembler::movptr(Address dst, intptr_t src) {
 734   mov64(rscratch1, src);
 735   movq(dst, rscratch1);
 736 }
 737 
 738 // These are mostly for initializing NULL
 739 void MacroAssembler::movptr(Address dst, int32_t src) {
 740   movslq(dst, src);
 741 }
 742 
 743 void MacroAssembler::movptr(Register dst, int32_t src) {
 744   mov64(dst, (intptr_t)src);
 745 }
 746 
 747 void MacroAssembler::pushoop(jobject obj) {
 748   movoop(rscratch1, obj);
 749   push(rscratch1);
 750 }
 751 
 752 void MacroAssembler::pushklass(Metadata* obj) {
 753   mov_metadata(rscratch1, obj);
 754   push(rscratch1);
 755 }
 756 
 757 void MacroAssembler::pushptr(AddressLiteral src) {
 758   lea(rscratch1, src);
 759   if (src.is_lval()) {
 760     push(rscratch1);
 761   } else {
 762     pushq(Address(rscratch1, 0));
 763   }
 764 }
 765 
 766 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 767   // we must set sp to zero to clear frame
 768   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 769   // must clear fp, so that compiled frames are not confused; it is
 770   // possible that we need it only for debugging
 771   if (clear_fp) {
 772     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 773   }
 774 
 775   // Always clear the pc because it could have been set by make_walkable()
 776   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 777   vzeroupper();
 778 }
 779 
 780 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 781                                          Register last_java_fp,
 782                                          address  last_java_pc) {
 783   vzeroupper();
 784   // determine last_java_sp register
 785   if (!last_java_sp->is_valid()) {
 786     last_java_sp = rsp;
 787   }
 788 
 789   // last_java_fp is optional
 790   if (last_java_fp->is_valid()) {
 791     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 792            last_java_fp);
 793   }
 794 
 795   // last_java_pc is optional
 796   if (last_java_pc != NULL) {
 797     Address java_pc(r15_thread,
 798                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 799     lea(rscratch1, InternalAddress(last_java_pc));
 800     movptr(java_pc, rscratch1);
 801   }
 802 
 803   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 804 }
 805 
 806 static void pass_arg0(MacroAssembler* masm, Register arg) {
 807   if (c_rarg0 != arg ) {
 808     masm->mov(c_rarg0, arg);
 809   }
 810 }
 811 
 812 static void pass_arg1(MacroAssembler* masm, Register arg) {
 813   if (c_rarg1 != arg ) {
 814     masm->mov(c_rarg1, arg);
 815   }
 816 }
 817 
 818 static void pass_arg2(MacroAssembler* masm, Register arg) {
 819   if (c_rarg2 != arg ) {
 820     masm->mov(c_rarg2, arg);
 821   }
 822 }
 823 
 824 static void pass_arg3(MacroAssembler* masm, Register arg) {
 825   if (c_rarg3 != arg ) {
 826     masm->mov(c_rarg3, arg);
 827   }
 828 }
 829 
 830 void MacroAssembler::stop(const char* msg) {
 831   address rip = pc();
 832   pusha(); // get regs on stack
 833   lea(c_rarg0, ExternalAddress((address) msg));
 834   lea(c_rarg1, InternalAddress(rip));
 835   movq(c_rarg2, rsp); // pass pointer to regs array
 836   andq(rsp, -16); // align stack as required by ABI
 837   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 838   hlt();
 839 }
 840 
 841 void MacroAssembler::warn(const char* msg) {
 842   push(rbp);
 843   movq(rbp, rsp);
 844   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 845   push_CPU_state();   // keeps alignment at 16 bytes
 846   lea(c_rarg0, ExternalAddress((address) msg));
 847   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 848   call(rax);
 849   pop_CPU_state();
 850   mov(rsp, rbp);
 851   pop(rbp);
 852 }
 853 
 854 void MacroAssembler::print_state() {
 855   address rip = pc();
 856   pusha();            // get regs on stack
 857   push(rbp);
 858   movq(rbp, rsp);
 859   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 860   push_CPU_state();   // keeps alignment at 16 bytes
 861 
 862   lea(c_rarg0, InternalAddress(rip));
 863   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 864   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 865 
 866   pop_CPU_state();
 867   mov(rsp, rbp);
 868   pop(rbp);
 869   popa();
 870 }
 871 
 872 #ifndef PRODUCT
 873 extern "C" void findpc(intptr_t x);
 874 #endif
 875 
 876 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 877   // In order to get locks to work, we need to fake an in_VM state
 878   if (ShowMessageBoxOnError) {
 879     JavaThread* thread = JavaThread::current();
 880     JavaThreadState saved_state = thread->thread_state();
 881     thread->set_thread_state(_thread_in_vm);
 882 #ifndef PRODUCT
 883     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 884       ttyLocker ttyl;
 885       BytecodeCounter::print();
 886     }
 887 #endif
 888     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 889     // XXX correct this offset for amd64
 890     // This is the value of eip which points to where verify_oop will return.
 891     if (os::message_box(msg, "Execution stopped, print registers?")) {
 892       print_state64(pc, regs);
 893       BREAKPOINT;
 894       assert(false, "start up GDB");
 895     }
 896     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 897   } else {
 898     ttyLocker ttyl;
 899     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
 900                     msg);
 901     assert(false, "DEBUG MESSAGE: %s", msg);
 902   }
 903 }
 904 
 905 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 906   ttyLocker ttyl;
 907   FlagSetting fs(Debugging, true);
 908   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 909 #ifndef PRODUCT
 910   tty->cr();
 911   findpc(pc);
 912   tty->cr();
 913 #endif
 914 #define PRINT_REG(rax, value) \
 915   { tty->print("%s = ", #rax); os::print_location(tty, value); }
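       // regs[] is the register save area laid down by pusha() (see stop() and
       // print_state()); the indices below select each saved register's slot.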
 916   PRINT_REG(rax, regs[15]);
 917   PRINT_REG(rbx, regs[12]);
 918   PRINT_REG(rcx, regs[14]);
 919   PRINT_REG(rdx, regs[13]);
 920   PRINT_REG(rdi, regs[8]);
 921   PRINT_REG(rsi, regs[9]);
 922   PRINT_REG(rbp, regs[10]);
 923   PRINT_REG(rsp, regs[11]);
 924   PRINT_REG(r8 , regs[7]);
 925   PRINT_REG(r9 , regs[6]);
 926   PRINT_REG(r10, regs[5]);
 927   PRINT_REG(r11, regs[4]);
 928   PRINT_REG(r12, regs[3]);
 929   PRINT_REG(r13, regs[2]);
 930   PRINT_REG(r14, regs[1]);
 931   PRINT_REG(r15, regs[0]);
 932 #undef PRINT_REG
 933   // Print some words near the top of the stack.
 934   int64_t* rsp = (int64_t*) regs[11];
 935   int64_t* dump_sp = rsp;
 936   for (int col1 = 0; col1 < 8; col1++) {
 937     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 938     os::print_location(tty, *dump_sp++);
 939   }
 940   for (int row = 0; row < 25; row++) {
 941     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 942     for (int col = 0; col < 4; col++) {
 943       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 944     }
 945     tty->cr();
 946   }
 947   // Print some instructions around pc:
 948   Disassembler::decode((address)pc-64, (address)pc);
 949   tty->print_cr("--------");
 950   Disassembler::decode((address)pc, (address)pc+32);
 951 }
 952 
 953 #endif // _LP64
 954 
 955 // Now versions that are common to 32/64 bit
 956 
 957 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 958   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 959 }
 960 
 961 void MacroAssembler::addptr(Register dst, Register src) {
 962   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 963 }
 964 
 965 void MacroAssembler::addptr(Address dst, Register src) {
 966   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 967 }
 968 
 969 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 970   if (reachable(src)) {
 971     Assembler::addsd(dst, as_Address(src));
 972   } else {
 973     lea(rscratch1, src);
 974     Assembler::addsd(dst, Address(rscratch1, 0));
 975   }
 976 }
 977 
 978 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 979   if (reachable(src)) {
 980     addss(dst, as_Address(src));
 981   } else {
 982     lea(rscratch1, src);
 983     addss(dst, Address(rscratch1, 0));
 984   }
 985 }
 986 
 987 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
 988   if (reachable(src)) {
 989     Assembler::addpd(dst, as_Address(src));
 990   } else {
 991     lea(rscratch1, src);
 992     Assembler::addpd(dst, Address(rscratch1, 0));
 993   }
 994 }
 995 
 996 void MacroAssembler::align(int modulus) {
 997   align(modulus, offset());
 998 }
 999 
1000 void MacroAssembler::align(int modulus, int target) {
1001   if (target % modulus != 0) {
1002     nop(modulus - (target % modulus));
1003   }
1004 }
1005 
1006 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
1007   // Used in sign-masking with aligned address.
1008   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1009   if (reachable(src)) {
1010     Assembler::andpd(dst, as_Address(src));
1011   } else {
1012     lea(rscratch1, src);
1013     Assembler::andpd(dst, Address(rscratch1, 0));
1014   }
1015 }
1016 
1017 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
1018   // Used in sign-masking with aligned address.
1019   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1020   if (reachable(src)) {
1021     Assembler::andps(dst, as_Address(src));
1022   } else {
1023     lea(rscratch1, src);
1024     Assembler::andps(dst, Address(rscratch1, 0));
1025   }
1026 }
1027 
1028 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1029   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1030 }
1031 
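     // Atomically increment the counter at counter_addr; the lock prefix is emitted only
     // on MP systems, where it is needed to make the read-modify-write atomic.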
1032 void MacroAssembler::atomic_incl(Address counter_addr) {
1033   if (os::is_MP())
1034     lock();
1035   incrementl(counter_addr);
1036 }
1037 
1038 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1039   if (reachable(counter_addr)) {
1040     atomic_incl(as_Address(counter_addr));
1041   } else {
1042     lea(scr, counter_addr);
1043     atomic_incl(Address(scr, 0));
1044   }
1045 }
1046 
1047 #ifdef _LP64
1048 void MacroAssembler::atomic_incq(Address counter_addr) {
1049   if (os::is_MP())
1050     lock();
1051   incrementq(counter_addr);
1052 }
1053 
1054 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1055   if (reachable(counter_addr)) {
1056     atomic_incq(as_Address(counter_addr));
1057   } else {
1058     lea(scr, counter_addr);
1059     atomic_incq(Address(scr, 0));
1060   }
1061 }
1062 #endif
1063 
1064 // Writes to successive stack pages until the given size is reached, to check for
1065 // stack overflow + shadow pages.  This clobbers tmp.
1066 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1067   movptr(tmp, rsp);
1068   // Bang stack for total size given plus shadow page size.
1069   // Bang one page at a time because large size can bang beyond yellow and
1070   // red zones.
1071   Label loop;
1072   bind(loop);
1073   movl(Address(tmp, (-os::vm_page_size())), size );
1074   subptr(tmp, os::vm_page_size());
1075   subl(size, os::vm_page_size());
1076   jcc(Assembler::greater, loop);
1077 
1078   // Bang down shadow pages too.
1079   // At this point, (tmp-0) is the last address touched, so don't
1080   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1081   // was post-decremented.)  Skip this address by starting at i=1, and
1082   // touch a few more pages below.  N.B.  It is important to touch all
1083   // the way down including all pages in the shadow zone.
1084   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1085     // this could be any sized move, but since it can serve as a debugging crumb,
1086     // the bigger the better.
1087     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1088   }
1089 }
1090 
1091 void MacroAssembler::reserved_stack_check() {
1092     // testing if reserved zone needs to be enabled
1093     Label no_reserved_zone_enabling;
1094     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1095     NOT_LP64(get_thread(rsi);)
1096 
1097     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1098     jcc(Assembler::below, no_reserved_zone_enabling);
1099 
1100     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1101     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1102     should_not_reach_here();
1103 
1104     bind(no_reserved_zone_enabling);
1105 }
1106 
1107 int MacroAssembler::biased_locking_enter(Register lock_reg,
1108                                          Register obj_reg,
1109                                          Register swap_reg,
1110                                          Register tmp_reg,
1111                                          bool swap_reg_contains_mark,
1112                                          Label& done,
1113                                          Label* slow_case,
1114                                          BiasedLockingCounters* counters) {
1115   assert(UseBiasedLocking, "why call this otherwise?");
1116   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1117   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1118   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1119   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1120   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1121   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1122 
1123   if (PrintBiasedLockingStatistics && counters == NULL) {
1124     counters = BiasedLocking::counters();
1125   }
1126   // Biased locking
1127   // See whether the lock is currently biased toward our thread and
1128   // whether the epoch is still valid
1129   // Note that the runtime guarantees sufficient alignment of JavaThread
1130   // pointers to allow age to be placed into low bits
1131   // First check to see whether biasing is even enabled for this object
1132   Label cas_label;
1133   int null_check_offset = -1;
1134   if (!swap_reg_contains_mark) {
1135     null_check_offset = offset();
1136     movptr(swap_reg, mark_addr);
1137   }
1138   movptr(tmp_reg, swap_reg);
1139   andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1140   cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1141   jcc(Assembler::notEqual, cas_label);
1142   // The bias pattern is present in the object's header. Need to check
1143   // whether the bias owner and the epoch are both still current.
1144 #ifndef _LP64
1145   // Note that because there is no current thread register on x86_32 we
1146   // need to store off the mark word we read out of the object to
1147   // avoid reloading it and needing to recheck invariants below. This
1148   // store is unfortunate but it makes the overall code shorter and
1149   // simpler.
1150   movptr(saved_mark_addr, swap_reg);
1151 #endif
1152   if (swap_reg_contains_mark) {
1153     null_check_offset = offset();
1154   }
1155   load_prototype_header(tmp_reg, obj_reg);
1156 #ifdef _LP64
1157   orptr(tmp_reg, r15_thread);
1158   xorptr(tmp_reg, swap_reg);
1159   Register header_reg = tmp_reg;
1160 #else
1161   xorptr(tmp_reg, swap_reg);
1162   get_thread(swap_reg);
1163   xorptr(swap_reg, tmp_reg);
1164   Register header_reg = swap_reg;
1165 #endif
1166   andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1167   if (counters != NULL) {
1168     cond_inc32(Assembler::zero,
1169                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1170   }
1171   jcc(Assembler::equal, done);
1172 
1173   Label try_revoke_bias;
1174   Label try_rebias;
1175 
1176   // At this point we know that the header has the bias pattern and
1177   // that we are not the bias owner in the current epoch. We need to
1178   // figure out more details about the state of the header in order to
1179   // know what operations can be legally performed on the object's
1180   // header.
1181 
1182   // If the low three bits in the xor result aren't clear, that means
1183   // the prototype header is no longer biased and we have to revoke
1184   // the bias on this object.
1185   testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1186   jccb(Assembler::notZero, try_revoke_bias);
1187 
1188   // Biasing is still enabled for this data type. See whether the
1189   // epoch of the current bias is still valid, meaning that the epoch
1190   // bits of the mark word are equal to the epoch bits of the
1191   // prototype header. (Note that the prototype header's epoch bits
1192   // only change at a safepoint.) If not, attempt to rebias the object
1193   // toward the current thread. Note that we must be absolutely sure
1194   // that the current epoch is invalid in order to do this because
1195   // otherwise the manipulations it performs on the mark word are
1196   // illegal.
1197   testptr(header_reg, markOopDesc::epoch_mask_in_place);
1198   jccb(Assembler::notZero, try_rebias);
1199 
1200   // The epoch of the current bias is still valid but we know nothing
1201   // about the owner; it might be set or it might be clear. Try to
1202   // acquire the bias of the object using an atomic operation. If this
1203   // fails we will go into the runtime to revoke the object's bias.
1204   // Note that we first construct the presumed unbiased header so we
1205   // don't accidentally blow away another thread's valid bias.
1206   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1207   andptr(swap_reg,
1208          markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1209 #ifdef _LP64
1210   movptr(tmp_reg, swap_reg);
1211   orptr(tmp_reg, r15_thread);
1212 #else
1213   get_thread(tmp_reg);
1214   orptr(tmp_reg, swap_reg);
1215 #endif
1216   if (os::is_MP()) {
1217     lock();
1218   }
1219   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1220   // If the biasing toward our thread failed, this means that
1221   // another thread succeeded in biasing it toward itself and we
1222   // need to revoke that bias. The revocation will occur in the
1223   // interpreter runtime in the slow case.
1224   if (counters != NULL) {
1225     cond_inc32(Assembler::zero,
1226                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1227   }
1228   if (slow_case != NULL) {
1229     jcc(Assembler::notZero, *slow_case);
1230   }
1231   jmp(done);
1232 
1233   bind(try_rebias);
1234   // At this point we know the epoch has expired, meaning that the
1235   // current "bias owner", if any, is actually invalid. Under these
1236   // circumstances _only_, we are allowed to use the current header's
1237   // value as the comparison value when doing the cas to acquire the
1238   // bias in the current epoch. In other words, we allow transfer of
1239   // the bias from one thread to another directly in this situation.
1240   //
1241   // FIXME: due to a lack of registers we currently blow away the age
1242   // bits in this situation. Should attempt to preserve them.
1243   load_prototype_header(tmp_reg, obj_reg);
1244 #ifdef _LP64
1245   orptr(tmp_reg, r15_thread);
1246 #else
1247   get_thread(swap_reg);
1248   orptr(tmp_reg, swap_reg);
1249   movptr(swap_reg, saved_mark_addr);
1250 #endif
1251   if (os::is_MP()) {
1252     lock();
1253   }
1254   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1255   // If the biasing toward our thread failed, then another thread
1256   // succeeded in biasing it toward itself and we need to revoke that
1257   // bias. The revocation will occur in the runtime in the slow case.
1258   if (counters != NULL) {
1259     cond_inc32(Assembler::zero,
1260                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1261   }
1262   if (slow_case != NULL) {
1263     jcc(Assembler::notZero, *slow_case);
1264   }
1265   jmp(done);
1266 
1267   bind(try_revoke_bias);
1268   // The prototype mark in the klass doesn't have the bias bit set any
1269   // more, indicating that objects of this data type are not supposed
1270   // to be biased any more. We are going to try to reset the mark of
1271   // this object to the prototype value and fall through to the
1272   // CAS-based locking scheme. Note that if our CAS fails, it means
1273   // that another thread raced us for the privilege of revoking the
1274   // bias of this particular object, so it's okay to continue in the
1275   // normal locking code.
1276   //
1277   // FIXME: due to a lack of registers we currently blow away the age
1278   // bits in this situation. Should attempt to preserve them.
1279   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1280   load_prototype_header(tmp_reg, obj_reg);
1281   if (os::is_MP()) {
1282     lock();
1283   }
1284   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1285   // Fall through to the normal CAS-based lock, because no matter what
1286   // the result of the above CAS, some thread must have succeeded in
1287   // removing the bias bit from the object's header.
1288   if (counters != NULL) {
1289     cond_inc32(Assembler::zero,
1290                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1291   }
1292 
1293   bind(cas_label);
1294 
1295   return null_check_offset;
1296 }
1297 
1298 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1299   assert(UseBiasedLocking, "why call this otherwise?");
1300 
1301   // Check for biased locking unlock case, which is a no-op
1302   // Note: we do not have to check the thread ID for two reasons.
1303   // First, the interpreter checks for IllegalMonitorStateException at
1304   // a higher level. Second, if the bias was revoked while we held the
1305   // lock, the object could not be rebiased toward another thread, so
1306   // the bias bit would be clear.
1307   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1308   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1309   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1310   jcc(Assembler::equal, done);
1311 }
1312 
1313 #ifdef COMPILER2
1314 
1315 #if INCLUDE_RTM_OPT
1316 
1317 // Update rtm_counters based on abort status
1318 // input: abort_status
1319 //        rtm_counters (RTMLockingCounters*)
1320 // flags are killed
1321 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1322 
1323   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1324   if (PrintPreciseRTMLockingStatistics) {
1325     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1326       Label check_abort;
1327       testl(abort_status, (1<<i));
1328       jccb(Assembler::equal, check_abort);
1329       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1330       bind(check_abort);
1331     }
1332   }
1333 }
1334 
1335 // Branch if (random & (count-1) != 0), count is 2^n
1336 // tmp, scr and flags are killed
1337 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1338   assert(tmp == rax, "");
1339   assert(scr == rdx, "");
1340   rdtsc(); // modifies EDX:EAX
1341   andptr(tmp, count-1);
1342   jccb(Assembler::notZero, brLabel);
1343 }
1344 
1345 // Perform abort ratio calculation, set no_rtm bit if high ratio
1346 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1347 // tmpReg, rtm_counters_Reg and flags are killed
1348 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1349                                                  Register rtm_counters_Reg,
1350                                                  RTMLockingCounters* rtm_counters,
1351                                                  Metadata* method_data) {
1352   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1353 
1354   if (RTMLockingCalculationDelay > 0) {
1355     // Delay calculation
1356     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1357     testptr(tmpReg, tmpReg);
1358     jccb(Assembler::equal, L_done);
1359   }
1360   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1361   //   Aborted transactions = abort_count * 100
1362   //   All transactions = total_count *  RTMTotalCountIncrRate
1363   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1364 
1365   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1366   cmpptr(tmpReg, RTMAbortThreshold);
1367   jccb(Assembler::below, L_check_always_rtm2);
1368   imulptr(tmpReg, tmpReg, 100);
1369 
1370   Register scrReg = rtm_counters_Reg;
1371   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1372   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1373   imulptr(scrReg, scrReg, RTMAbortRatio);
1374   cmpptr(tmpReg, scrReg);
1375   jccb(Assembler::below, L_check_always_rtm1);
1376   if (method_data != NULL) {
1377     // set rtm_state to "no rtm" in MDO
1378     mov_metadata(tmpReg, method_data);
1379     if (os::is_MP()) {
1380       lock();
1381     }
1382     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1383   }
1384   jmpb(L_done);
1385   bind(L_check_always_rtm1);
1386   // Reload RTMLockingCounters* address
1387   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1388   bind(L_check_always_rtm2);
1389   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1390   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1391   jccb(Assembler::below, L_done);
1392   if (method_data != NULL) {
1393     // set rtm_state to "always rtm" in MDO
1394     mov_metadata(tmpReg, method_data);
1395     if (os::is_MP()) {
1396       lock();
1397     }
1398     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1399   }
1400   bind(L_done);
1401 }
1402 
1403 // Update counters and perform abort ratio calculation
1404 // input:  abort_status_Reg
1405 // rtm_counters_Reg, flags are killed
1406 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1407                                    Register rtm_counters_Reg,
1408                                    RTMLockingCounters* rtm_counters,
1409                                    Metadata* method_data,
1410                                    bool profile_rtm) {
1411 
1412   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1413   // update rtm counters based on rax value at abort
1414   // reads abort_status_Reg, updates flags
1415   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1416   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1417   if (profile_rtm) {
1418     // Save abort status because abort_status_Reg is used by following code.
1419     if (RTMRetryCount > 0) {
1420       push(abort_status_Reg);
1421     }
1422     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1423     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1424     // restore abort status
1425     if (RTMRetryCount > 0) {
1426       pop(abort_status_Reg);
1427     }
1428   }
1429 }
1430 
1431 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1432 // inputs: retry_count_Reg
1433 //       : abort_status_Reg
1434 // output: retry_count_Reg decremented by 1
1435 // flags are killed
1436 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1437   Label doneRetry;
1438   assert(abort_status_Reg == rax, "");
1439   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1440   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1441   // if reason is in 0x6 and retry count != 0 then retry
1442   andptr(abort_status_Reg, 0x6);
1443   jccb(Assembler::zero, doneRetry);
1444   testl(retry_count_Reg, retry_count_Reg);
1445   jccb(Assembler::zero, doneRetry);
1446   pause();
1447   decrementl(retry_count_Reg);
1448   jmp(retryLabel);
1449   bind(doneRetry);
1450 }
1451 
1452 // Spin and retry if lock is busy,
1453 // inputs: box_Reg (monitor address)
1454 //       : retry_count_Reg
1455 // output: retry_count_Reg decremented by 1
1456 //       : clear z flag if retry count exceeded
1457 // tmp_Reg, scr_Reg, flags are killed
1458 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1459                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1460   Label SpinLoop, SpinExit, doneRetry;
1461   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1462 
1463   testl(retry_count_Reg, retry_count_Reg);
1464   jccb(Assembler::zero, doneRetry);
1465   decrementl(retry_count_Reg);
1466   movptr(scr_Reg, RTMSpinLoopCount);
1467 
1468   bind(SpinLoop);
1469   pause();
1470   decrementl(scr_Reg);
1471   jccb(Assembler::lessEqual, SpinExit);
1472   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1473   testptr(tmp_Reg, tmp_Reg);
1474   jccb(Assembler::notZero, SpinLoop);
1475 
1476   bind(SpinExit);
1477   jmp(retryLabel);
1478   bind(doneRetry);
1479   incrementl(retry_count_Reg); // clear z flag
1480 }
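// For illustration only: a rough C-like sketch of the spin-then-retry policy
// emitted above.  'm' stands for the ObjectMonitor addressed by box_Reg.
//
//   if (retry_count != 0) {
//     retry_count--;
//     int spins = RTMSpinLoopCount;
//     do {
//       pause();
//     } while (--spins > 0 && m->owner != NULL);     // wait for the lock to look free
//     goto retryLabel;
//   }
//   retry_count++;                                   // doneRetry: 0 -> 1, so ZF is cleared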
1481 
1482 // Use RTM for normal stack locks
1483 // Input: objReg (object to lock)
1484 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1485                                        Register retry_on_abort_count_Reg,
1486                                        RTMLockingCounters* stack_rtm_counters,
1487                                        Metadata* method_data, bool profile_rtm,
1488                                        Label& DONE_LABEL, Label& IsInflated) {
1489   assert(UseRTMForStackLocks, "why call this otherwise?");
1490   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1491   assert(tmpReg == rax, "");
1492   assert(scrReg == rdx, "");
1493   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1494 
1495   if (RTMRetryCount > 0) {
1496     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1497     bind(L_rtm_retry);
1498   }
1499   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1500   testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1501   jcc(Assembler::notZero, IsInflated);
1502 
1503   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1504     Label L_noincrement;
1505     if (RTMTotalCountIncrRate > 1) {
1506       // tmpReg, scrReg and flags are killed
1507       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1508     }
1509     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1510     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1511     bind(L_noincrement);
1512   }
1513   xbegin(L_on_abort);
1514   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
1515   andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1516   cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1517   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1518 
1519   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1520   if (UseRTMXendForLockBusy) {
1521     xend();
1522     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1523     jmp(L_decrement_retry);
1524   }
1525   else {
1526     xabort(0);
1527   }
1528   bind(L_on_abort);
1529   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1530     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1531   }
1532   bind(L_decrement_retry);
1533   if (RTMRetryCount > 0) {
1534     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1535     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1536   }
1537 }
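// For illustration only: a rough sketch of the stack-lock elision attempt
// emitted above (profiling and the retry bookkeeping are elided).
//
//   retry:
//     if (obj->mark() is inflated) goto IsInflated;               // checked outside the transaction
//     xbegin();
//     if ((obj->mark() & lock_bits) == unlocked_value) goto DONE; // lock elided, stay transactional
//     // somebody holds the lock: leave the transaction
//     if (UseRTMXendForLockBusy) { xend(); abort_status = 0x2; }
//     else                       { xabort(0); /* lands at on_abort */ }
//   on_abort:
//     update counters / abort ratio if profiling;
//     if (abort is retryable && retries remain) goto retry;
//   // fall through: caller continues with the ordinary stack-locking path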
1538 
1539 // Use RTM for inflated locks
1540 // inputs: objReg (object to lock)
1541 //         boxReg (on-stack box address (displaced header location) - KILLED)
1542 //         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1543 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1544                                           Register scrReg, Register retry_on_busy_count_Reg,
1545                                           Register retry_on_abort_count_Reg,
1546                                           RTMLockingCounters* rtm_counters,
1547                                           Metadata* method_data, bool profile_rtm,
1548                                           Label& DONE_LABEL) {
1549   assert(UseRTMLocking, "why call this otherwise?");
1550   assert(tmpReg == rax, "");
1551   assert(scrReg == rdx, "");
1552   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1553   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1554 
1555   // Without cast to int32_t a movptr will destroy r10 which is typically obj
1556   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1557   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1558 
1559   if (RTMRetryCount > 0) {
1560     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1561     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1562     bind(L_rtm_retry);
1563   }
1564   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1565     Label L_noincrement;
1566     if (RTMTotalCountIncrRate > 1) {
1567       // tmpReg, scrReg and flags are killed
1568       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1569     }
1570     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1571     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1572     bind(L_noincrement);
1573   }
1574   xbegin(L_on_abort);
1575   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1576   movptr(tmpReg, Address(tmpReg, owner_offset));
1577   testptr(tmpReg, tmpReg);
1578   jcc(Assembler::zero, DONE_LABEL);
1579   if (UseRTMXendForLockBusy) {
1580     xend();
1581     jmp(L_decrement_retry);
1582   }
1583   else {
1584     xabort(0);
1585   }
1586   bind(L_on_abort);
1587   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1588   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1589     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1590   }
1591   if (RTMRetryCount > 0) {
1592     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1593     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1594   }
1595 
1596   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1597   testptr(tmpReg, tmpReg) ;
1598   jccb(Assembler::notZero, L_decrement_retry) ;
1599 
1600   // Appears unlocked - try to swing _owner from null to non-null.
1601   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1602 #ifdef _LP64
1603   Register threadReg = r15_thread;
1604 #else
1605   get_thread(scrReg);
1606   Register threadReg = scrReg;
1607 #endif
1608   if (os::is_MP()) {
1609     lock();
1610   }
1611   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1612 
1613   if (RTMRetryCount > 0) {
1614     // success done else retry
1615     jccb(Assembler::equal, DONE_LABEL) ;
1616     bind(L_decrement_retry);
1617     // Spin and retry if lock is busy.
1618     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1619   }
1620   else {
1621     bind(L_decrement_retry);
1622   }
1623 }
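// For illustration only: a rough sketch of the inflated-lock elision attempt
// emitted above.  'm' stands for the ObjectMonitor passed in via tmpReg.
//
//   box->dhw = unused_mark; box = m;
//   retry:
//     xbegin();
//     if (m->owner == NULL) goto DONE;                            // lock elided, ZF == 1
//     if (UseRTMXendForLockBusy) xend(); else xabort(0);
//   on_abort:
//     update counters if profiling; retry on a retryable abort;
//     if (m->owner == NULL && CAS(&m->owner, NULL, Self)) goto DONE;   // plain acquisition
//     spin while m->owner != NULL, then goto retry (or fall through with ZF == 0)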
1624 
1625 #endif //  INCLUDE_RTM_OPT
1626 
1627 // Fast_Lock and Fast_Unlock used by C2
1628 
1629 // Because the transitions from emitted code to the runtime
1630 // monitorenter/exit helper stubs are so slow it's critical that
1631 // we inline both the stack-locking fast-path and the inflated fast path.
1632 //
1633 // See also: cmpFastLock and cmpFastUnlock.
1634 //
1635 // What follows is a specialized inline transliteration of the code
1636 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1637 // another option would be to emit TrySlowEnter and TrySlowExit methods
1638 // at startup-time.  These methods would accept arguments as
1639 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1640 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1641 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1642 // In practice, however, the # of lock sites is bounded and is usually small.
1643 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1644 // if the processor uses simple bimodal branch predictors keyed by EIP,
1645 // since the helper routines would be called from multiple synchronization
1646 // sites.
1647 //
1648 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1649 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1650 // to those specialized methods.  That'd give us a mostly platform-independent
1651 // implementation that the JITs could optimize and inline at their pleasure.
1652 // Done correctly, the only time we'd need to cross to native code would be
1653 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1654 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1655 // (b) explicit barriers or fence operations.
1656 //
1657 // TODO:
1658 //
1659 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1660 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1661 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1662 //    the lock operators would typically be faster than reifying Self.
1663 //
1664 // *  Ideally I'd define the primitives as:
1665 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1666 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1667 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1668 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
1669 //    Furthermore the register assignments are overconstrained, possibly resulting in
1670 //    sub-optimal code near the synchronization site.
1671 //
1672 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1673 //    Alternately, use a better sp-proximity test.
1674 //
1675 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1676 //    Either one is sufficient to uniquely identify a thread.
1677 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1678 //
1679 // *  Intrinsify notify() and notifyAll() for the common cases where the
1680 //    object is locked by the calling thread but the waitlist is empty,
1681 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1682 //
1683 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1684 //    But beware of excessive branch density on AMD Opterons.
1685 //
1686 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1687 //    or failure of the fast-path.  If the fast-path fails then we pass
1688 //    control to the slow-path, typically in C.  In Fast_Lock and
1689 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1690 //    will emit a conditional branch immediately after the node.
1691 //    So we have branches to branches and lots of ICC.ZF games.
1692 //    Instead, it might be better to have C2 pass a "FailureLabel"
1693 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1694 //    will drop through the node.  ICC.ZF is undefined at exit.
1695 //    In the case of failure, the node will branch directly to the
1696 //    FailureLabel
1697 
1698 
1699 // obj: object to lock
1700 // box: on-stack box address (displaced header location) - KILLED
1701 // rax,: tmp -- KILLED
1702 // scr: tmp -- KILLED
1703 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1704                                Register scrReg, Register cx1Reg, Register cx2Reg,
1705                                BiasedLockingCounters* counters,
1706                                RTMLockingCounters* rtm_counters,
1707                                RTMLockingCounters* stack_rtm_counters,
1708                                Metadata* method_data,
1709                                bool use_rtm, bool profile_rtm) {
1710   // Ensure the register assignments are disjoint
1711   assert(tmpReg == rax, "");
1712 
1713   if (use_rtm) {
1714     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1715   } else {
1716     assert(cx1Reg == noreg, "");
1717     assert(cx2Reg == noreg, "");
1718     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1719   }
1720 
1721   if (counters != NULL) {
1722     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1723   }
1724 
1725   // Possible cases that we'll encounter in fast_lock
1726   // ------------------------------------------------
1727   // * Inflated
1728   //    -- unlocked
1729   //    -- Locked
1730   //       = by self
1731   //       = by other
1732   // * biased
1733   //    -- by Self
1734   //    -- by other
1735   // * neutral
1736   // * stack-locked
1737   //    -- by self
1738   //       = sp-proximity test hits
1739   //       = sp-proximity test generates false-negative
1740   //    -- by other
1741   //
1742 
1743   Label IsInflated, DONE_LABEL;
1744 
1745   // it's stack-locked, biased or neutral
1746   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1747   // order to reduce the number of conditional branches in the most common cases.
1748   // Beware -- there's a subtle invariant that fetch of the markword
1749   // at [FETCH], below, will never observe a biased encoding (*101b).
1750   // If this invariant is not held we risk exclusion (safety) failure.
1751   if (UseBiasedLocking && !UseOptoBiasInlining) {
1752     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1753   }
1754 
1755 #if INCLUDE_RTM_OPT
1756   if (UseRTMForStackLocks && use_rtm) {
1757     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1758                       stack_rtm_counters, method_data, profile_rtm,
1759                       DONE_LABEL, IsInflated);
1760   }
1761 #endif // INCLUDE_RTM_OPT
1762 
1763   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
1764   testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1765   jccb(Assembler::notZero, IsInflated);
1766 
1767   // Attempt stack-locking ...
1768   orptr (tmpReg, markOopDesc::unlocked_value);
1769   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1770   if (os::is_MP()) {
1771     lock();
1772   }
1773   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
1774   if (counters != NULL) {
1775     cond_inc32(Assembler::equal,
1776                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1777   }
1778   jcc(Assembler::equal, DONE_LABEL);           // Success
1779 
1780   // Recursive locking.
1781   // The object is stack-locked: markword contains stack pointer to BasicLock.
1782   // Locked by current thread if difference with current SP is less than one page.
1783   subptr(tmpReg, rsp);
1784   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1785   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1786   movptr(Address(boxReg, 0), tmpReg);
1787   if (counters != NULL) {
1788     cond_inc32(Assembler::equal,
1789                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1790   }
1791   jmp(DONE_LABEL);
1792 
1793   bind(IsInflated);
1794   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1795 
1796 #if INCLUDE_RTM_OPT
1797   // Use the same RTM locking code in 32- and 64-bit VM.
1798   if (use_rtm) {
1799     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1800                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1801   } else {
1802 #endif // INCLUDE_RTM_OPT
1803 
1804 #ifndef _LP64
1805   // The object is inflated.
1806 
1807   // boxReg refers to the on-stack BasicLock in the current frame.
1808   // We'd like to write:
1809   //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1810   // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
1811   // additional latency as we have another ST in the store buffer that must drain.
1812 
1813   // avoid ST-before-CAS
1814   // register juggle because we need tmpReg for cmpxchgptr below
1815   movptr(scrReg, boxReg);
1816   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1817 
1818   // Optimistic form: consider XORL tmpReg,tmpReg
1819   movptr(tmpReg, NULL_WORD);
1820 
1821   // Appears unlocked - try to swing _owner from null to non-null.
1822   // Ideally, I'd manifest "Self" with get_thread and then attempt
1823   // to CAS the register containing Self into m->Owner.
1824   // But we don't have enough registers, so instead we can either try to CAS
1825   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1826   // we later store "Self" into m->Owner.  Transiently storing a stack address
1827   // (rsp or the address of the box) into  m->owner is harmless.
1828   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1829   if (os::is_MP()) {
1830     lock();
1831   }
1832   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1833   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1834   // If we weren't able to swing _owner from NULL to the BasicLock
1835   // then take the slow path.
1836   jccb  (Assembler::notZero, DONE_LABEL);
1837   // update _owner from BasicLock to thread
1838   get_thread (scrReg);                    // beware: clobbers ICCs
1839   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1840   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1841 
1842   // If the CAS fails we can either retry or pass control to the slow-path.
1843   // We use the latter tactic.
1844   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1845   // If the CAS was successful ...
1846   //   Self has acquired the lock
1847   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1848   // Intentional fall-through into DONE_LABEL ...
1849 #else // _LP64
1850   // It's inflated
1851   movq(scrReg, tmpReg);
1852   xorq(tmpReg, tmpReg);
1853 
1854   if (os::is_MP()) {
1855     lock();
1856   }
1857   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1858   // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1859   // Without cast to int32_t movptr will destroy r10 which is typically obj.
1860   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1861   // Intentional fall-through into DONE_LABEL ...
1862   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1863 #endif // _LP64
1864 #if INCLUDE_RTM_OPT
1865   } // use_rtm()
1866 #endif
1867   // DONE_LABEL is a hot target - we'd really like to place it at the
1868   // start of cache line by padding with NOPs.
1869   // See the AMD and Intel software optimization manuals for the
1870   // most efficient "long" NOP encodings.
1871   // Unfortunately none of our alignment mechanisms suffice.
1872   bind(DONE_LABEL);
1873 
1874   // At DONE_LABEL the icc ZFlag is set as follows ...
1875   // Fast_Unlock uses the same protocol.
1876   // ZFlag == 1 -> Success
1877   // ZFlag == 0 -> Failure - force control through the slow-path
1878 }
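// For illustration only: a condensed view of the non-RTM fast_lock path above
// (biased locking elided; LP64 flavor for the inflated case).
//
//   mark = obj->mark();
//   if (mark is inflated) {
//     ZF = CAS(&monitor->owner, NULL, Self) succeeded;
//     box->dhw = unused_mark;
//   } else {
//     box->dhw = mark | unlocked_value;
//     if (CAS(&obj->mark, mark | unlocked_value, box)) ZF = 1;    // stack-locked
//     else {
//       box->dhw = (observed_mark - rsp) & mask;   // 0 iff recursively owned by this thread
//       ZF = (box->dhw == 0);
//     }
//   }
//   // ZF == 1 => success; ZF == 0 => caller takes the slow path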
1879 
1880 // obj: object to unlock
1881 // box: box address (displaced header location), killed.  Must be EAX.
1882 // tmp: killed, cannot be obj nor box.
1883 //
1884 // Some commentary on balanced locking:
1885 //
1886 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1887 // Methods that don't have provably balanced locking are forced to run in the
1888 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1889 // The interpreter provides two properties:
1890 // I1:  At return-time the interpreter automatically and quietly unlocks any
1891 //      objects acquired in the current activation (frame).  Recall that the
1892 //      interpreter maintains an on-stack list of locks currently held by
1893 //      a frame.
1894 // I2:  If a method attempts to unlock an object that is not held by the
1895 //      frame, the interpreter throws IMSX.
1896 //
1897 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1898 // B() doesn't have provably balanced locking so it runs in the interpreter.
1899 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1900 // is still locked by A().
1901 //
1902 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1903 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1904 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1905 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1906 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1907 // could reasonably *avoid* checking owner in Fast_Unlock().
1908 // In the interest of performance we elide m->Owner==Self check in unlock.
1909 // A perfectly viable alternative is to elide the owner check except when
1910 // Xcheck:jni is enabled.
1911 
1912 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1913   assert(boxReg == rax, "");
1914   assert_different_registers(objReg, boxReg, tmpReg);
1915 
1916   Label DONE_LABEL, Stacked, CheckSucc;
1917 
1918   // Critically, the biased locking test must have precedence over
1919   // and appear before the (box->dhw == 0) recursive stack-lock test.
1920   if (UseBiasedLocking && !UseOptoBiasInlining) {
1921     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1922   }
1923 
1924 #if INCLUDE_RTM_OPT
1925   if (UseRTMForStackLocks && use_rtm) {
1926     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1927     Label L_regular_unlock;
1928     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));           // fetch markword
1929     andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1930     cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1931     jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1932     xend();                                       // otherwise end...
1933     jmp(DONE_LABEL);                              // ... and we're done
1934     bind(L_regular_unlock);
1935   }
1936 #endif
1937 
1938   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1939   jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1940   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));             // Examine the object's markword
1941   testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1942   jccb  (Assembler::zero, Stacked);
1943 
1944   // It's inflated.
1945 #if INCLUDE_RTM_OPT
1946   if (use_rtm) {
1947     Label L_regular_inflated_unlock;
1948     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1949     movptr(boxReg, Address(tmpReg, owner_offset));
1950     testptr(boxReg, boxReg);
1951     jccb(Assembler::notZero, L_regular_inflated_unlock);
1952     xend();
1953     jmpb(DONE_LABEL);
1954     bind(L_regular_inflated_unlock);
1955   }
1956 #endif
1957 
1958   // Despite our balanced locking property we still check that m->_owner == Self
1959   // as java routines or native JNI code called by this thread might
1960   // have released the lock.
1961   // Refer to the comments in synchronizer.cpp for how we might encode extra
1962   // state in _succ so we can avoid fetching EntryList|cxq.
1963   //
1964   // I'd like to add more cases in fast_lock() and fast_unlock() --
1965   // such as recursive enter and exit -- but we have to be wary of
1966   // I$ bloat, T$ effects and BP$ effects.
1967   //
1968   // If there's no contention try a 1-0 exit.  That is, exit without
1969   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1970   // we detect and recover from the race that the 1-0 exit admits.
1971   //
1972   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1973   // before it STs null into _owner, releasing the lock.  Updates
1974   // to data protected by the critical section must be visible before
1975   // we drop the lock (and thus before any other thread could acquire
1976   // the lock and observe the fields protected by the lock).
1977   // IA32's memory-model is SPO, so STs are ordered with respect to
1978   // each other and there's no need for an explicit barrier (fence).
1979   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
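  // For illustration only: a rough sketch of the 1-0 exit implemented below
  // (LP64 flavor).  'm' is the inflated ObjectMonitor held in tmpReg.
  //
  //   if (m->recursions != 0) goto slow_path;                            // ZF == 0
  //   if ((m->cxq | m->EntryList) == 0) { m->owner = NULL; goto done; }  // uncontended 1-0 exit
  //   if (m->succ == NULL) goto slow_path;                               // no heir presumptive
  //   m->owner = NULL;
  //   fence();                                                           // ST owner; MEMBAR; LD succ
  //   if (m->succ != NULL) goto success;                                 // successor will take over
  //   if (!CAS(&m->owner, NULL, Self)) goto success;                     // someone else acquired; we're done
  //   goto slow_path;                                                    // re-acquired; must pick a successor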
1980 #ifndef _LP64
1981   get_thread (boxReg);
1982 
1983   // Note that we could employ various encoding schemes to reduce
1984   // the number of loads below (currently 4) to just 2 or 3.
1985   // Refer to the comments in synchronizer.cpp.
1986   // In practice the chain of fetches doesn't seem to impact performance, however.
1987   xorptr(boxReg, boxReg);
1988   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1989   jccb  (Assembler::notZero, DONE_LABEL);
1990   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1991   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1992   jccb  (Assembler::notZero, CheckSucc);
1993   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1994   jmpb  (DONE_LABEL);
1995 
1996   bind (Stacked);
1997   // It's not inflated and it's not recursively stack-locked and it's not biased.
1998   // It must be stack-locked.
1999   // Try to reset the header to displaced header.
2000   // The "box" value on the stack is stable, so we can reload
2001   // and be assured we observe the same value as above.
2002   movptr(tmpReg, Address(boxReg, 0));
2003   if (os::is_MP()) {
2004     lock();
2005   }
2006   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2007   // Intentional fall-through into DONE_LABEL
2008 
2009   // DONE_LABEL is a hot target - we'd really like to place it at the
2010   // start of cache line by padding with NOPs.
2011   // See the AMD and Intel software optimization manuals for the
2012   // most efficient "long" NOP encodings.
2013   // Unfortunately none of our alignment mechanisms suffice.
2014   bind (CheckSucc);
2015 #else // _LP64
2016   // It's inflated
2017   xorptr(boxReg, boxReg);
2018   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2019   jccb  (Assembler::notZero, DONE_LABEL);
2020   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2021   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2022   jccb  (Assembler::notZero, CheckSucc);
2023   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2024   jmpb  (DONE_LABEL);
2025 
2026   // Try to avoid passing control into the slow_path ...
2027   Label LSuccess, LGoSlowPath ;
2028   bind  (CheckSucc);
2029 
2030   // The following optional optimization can be elided if necessary
2031   // Effectively: if (succ == null) goto SlowPath
2032   // The code reduces the window for a race, however,
2033   // and thus benefits performance.
2034   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2035   jccb  (Assembler::zero, LGoSlowPath);
2036 
2037   xorptr(boxReg, boxReg);
2038   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2039   if (os::is_MP()) {
2040     // Memory barrier/fence
2041     // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2042     // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2043     // This is faster on Nehalem and AMD Shanghai/Barcelona.
2044     // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2045     // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2046     // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2047     lock(); addl(Address(rsp, 0), 0);
2048   }
2049   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2050   jccb  (Assembler::notZero, LSuccess);
2051 
2052   // Rare inopportune interleaving - race.
2053   // The successor vanished in the small window above.
2054   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2055   // We need to ensure progress and succession.
2056   // Try to reacquire the lock.
2057   // If that fails then the new owner is responsible for succession and this
2058   // thread needs to take no further action and can exit via the fast path (success).
2059   // If the re-acquire succeeds then pass control into the slow path.
2060   // As implemented, this latter mode is horrible because we generate more
2061   // coherence traffic on the lock *and* artificially extend the critical section
2062   // length by virtue of passing control into the slow path.
2063 
2064   // box is really RAX -- the following CMPXCHG depends on that binding
2065   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2066   if (os::is_MP()) { lock(); }
2067   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2068   // There's no successor so we tried to regrab the lock.
2069   // If that didn't work, then another thread grabbed the
2070   // lock so we're done (and exit was a success).
2071   jccb  (Assembler::notEqual, LSuccess);
2072   // Intentional fall-through into slow-path
2073 
2074   bind  (LGoSlowPath);
2075   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2076   jmpb  (DONE_LABEL);
2077 
2078   bind  (LSuccess);
2079   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2080   jmpb  (DONE_LABEL);
2081 
2082   bind  (Stacked);
2083   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2084   if (os::is_MP()) { lock(); }
2085   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2086 
2087 #endif
2088   bind(DONE_LABEL);
2089 }
2090 #endif // COMPILER2
2091 
2092 void MacroAssembler::c2bool(Register x) {
2093   // implements x == 0 ? 0 : 1
2094   // note: must only look at the least-significant byte of x
2095   //       since C-style booleans are stored in one byte
2096   //       only! (was a bug)
2097   andl(x, 0xFF);
2098   setb(Assembler::notZero, x);
2099 }
2100 
2101 // Wouldn't need if AddressLiteral version had new name
2102 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2103   Assembler::call(L, rtype);
2104 }
2105 
2106 void MacroAssembler::call(Register entry) {
2107   Assembler::call(entry);
2108 }
2109 
2110 void MacroAssembler::call(AddressLiteral entry) {
2111   if (reachable(entry)) {
2112     Assembler::call_literal(entry.target(), entry.rspec());
2113   } else {
2114     lea(rscratch1, entry);
2115     Assembler::call(rscratch1);
2116   }
2117 }
2118 
2119 void MacroAssembler::ic_call(address entry, jint method_index) {
2120   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2121   movptr(rax, (intptr_t)Universe::non_oop_word());
2122   call(AddressLiteral(entry, rh));
2123 }
2124 
2125 // Implementation of call_VM versions
2126 
2127 void MacroAssembler::call_VM(Register oop_result,
2128                              address entry_point,
2129                              bool check_exceptions) {
2130   Label C, E;
2131   call(C, relocInfo::none);
2132   jmp(E);
2133 
2134   bind(C);
2135   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2136   ret(0);
2137 
2138   bind(E);
2139 }
2140 
2141 void MacroAssembler::call_VM(Register oop_result,
2142                              address entry_point,
2143                              Register arg_1,
2144                              bool check_exceptions) {
2145   Label C, E;
2146   call(C, relocInfo::none);
2147   jmp(E);
2148 
2149   bind(C);
2150   pass_arg1(this, arg_1);
2151   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2152   ret(0);
2153 
2154   bind(E);
2155 }
2156 
2157 void MacroAssembler::call_VM(Register oop_result,
2158                              address entry_point,
2159                              Register arg_1,
2160                              Register arg_2,
2161                              bool check_exceptions) {
2162   Label C, E;
2163   call(C, relocInfo::none);
2164   jmp(E);
2165 
2166   bind(C);
2167 
2168   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2169 
2170   pass_arg2(this, arg_2);
2171   pass_arg1(this, arg_1);
2172   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2173   ret(0);
2174 
2175   bind(E);
2176 }
2177 
2178 void MacroAssembler::call_VM(Register oop_result,
2179                              address entry_point,
2180                              Register arg_1,
2181                              Register arg_2,
2182                              Register arg_3,
2183                              bool check_exceptions) {
2184   Label C, E;
2185   call(C, relocInfo::none);
2186   jmp(E);
2187 
2188   bind(C);
2189 
2190   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2191   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2192   pass_arg3(this, arg_3);
2193 
2194   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2195   pass_arg2(this, arg_2);
2196 
2197   pass_arg1(this, arg_1);
2198   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2199   ret(0);
2200 
2201   bind(E);
2202 }
2203 
2204 void MacroAssembler::call_VM(Register oop_result,
2205                              Register last_java_sp,
2206                              address entry_point,
2207                              int number_of_arguments,
2208                              bool check_exceptions) {
2209   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2210   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2211 }
2212 
2213 void MacroAssembler::call_VM(Register oop_result,
2214                              Register last_java_sp,
2215                              address entry_point,
2216                              Register arg_1,
2217                              bool check_exceptions) {
2218   pass_arg1(this, arg_1);
2219   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2220 }
2221 
2222 void MacroAssembler::call_VM(Register oop_result,
2223                              Register last_java_sp,
2224                              address entry_point,
2225                              Register arg_1,
2226                              Register arg_2,
2227                              bool check_exceptions) {
2228 
2229   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2230   pass_arg2(this, arg_2);
2231   pass_arg1(this, arg_1);
2232   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2233 }
2234 
2235 void MacroAssembler::call_VM(Register oop_result,
2236                              Register last_java_sp,
2237                              address entry_point,
2238                              Register arg_1,
2239                              Register arg_2,
2240                              Register arg_3,
2241                              bool check_exceptions) {
2242   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2243   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2244   pass_arg3(this, arg_3);
2245   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2246   pass_arg2(this, arg_2);
2247   pass_arg1(this, arg_1);
2248   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2249 }
2250 
2251 void MacroAssembler::super_call_VM(Register oop_result,
2252                                    Register last_java_sp,
2253                                    address entry_point,
2254                                    int number_of_arguments,
2255                                    bool check_exceptions) {
2256   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2257   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2258 }
2259 
2260 void MacroAssembler::super_call_VM(Register oop_result,
2261                                    Register last_java_sp,
2262                                    address entry_point,
2263                                    Register arg_1,
2264                                    bool check_exceptions) {
2265   pass_arg1(this, arg_1);
2266   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2267 }
2268 
2269 void MacroAssembler::super_call_VM(Register oop_result,
2270                                    Register last_java_sp,
2271                                    address entry_point,
2272                                    Register arg_1,
2273                                    Register arg_2,
2274                                    bool check_exceptions) {
2275 
2276   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2277   pass_arg2(this, arg_2);
2278   pass_arg1(this, arg_1);
2279   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2280 }
2281 
2282 void MacroAssembler::super_call_VM(Register oop_result,
2283                                    Register last_java_sp,
2284                                    address entry_point,
2285                                    Register arg_1,
2286                                    Register arg_2,
2287                                    Register arg_3,
2288                                    bool check_exceptions) {
2289   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2290   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2291   pass_arg3(this, arg_3);
2292   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2293   pass_arg2(this, arg_2);
2294   pass_arg1(this, arg_1);
2295   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2296 }
2297 
2298 void MacroAssembler::call_VM_base(Register oop_result,
2299                                   Register java_thread,
2300                                   Register last_java_sp,
2301                                   address  entry_point,
2302                                   int      number_of_arguments,
2303                                   bool     check_exceptions) {
2304   // determine java_thread register
2305   if (!java_thread->is_valid()) {
2306 #ifdef _LP64
2307     java_thread = r15_thread;
2308 #else
2309     java_thread = rdi;
2310     get_thread(java_thread);
2311 #endif // LP64
2312   }
2313   // determine last_java_sp register
2314   if (!last_java_sp->is_valid()) {
2315     last_java_sp = rsp;
2316   }
2317   // debugging support
2318   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2319   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2320 #ifdef ASSERT
2321   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2322   // r12 is the heapbase.
2323   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2324 #endif // ASSERT
2325 
2326   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2327   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2328 
2329   // push java thread (becomes first argument of C function)
2330 
2331   NOT_LP64(push(java_thread); number_of_arguments++);
2332   LP64_ONLY(mov(c_rarg0, r15_thread));
2333 
2334   // set last Java frame before call
2335   assert(last_java_sp != rbp, "can't use ebp/rbp");
2336 
2337   // Only interpreter should have to set fp
2338   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2339 
2340   // do the call, remove parameters
2341   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2342 
2343   // restore the thread (cannot use the pushed argument since arguments
2344   // may be overwritten by C code generated by an optimizing compiler);
2345   // however, we can use the register value directly if it is callee saved.
2346   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2347     // rdi & rsi (also r15) are callee saved -> nothing to do
2348 #ifdef ASSERT
2349     guarantee(java_thread != rax, "change this code");
2350     push(rax);
2351     { Label L;
2352       get_thread(rax);
2353       cmpptr(java_thread, rax);
2354       jcc(Assembler::equal, L);
2355       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2356       bind(L);
2357     }
2358     pop(rax);
2359 #endif
2360   } else {
2361     get_thread(java_thread);
2362   }
2363   // reset last Java frame
2364   // Only interpreter should have to clear fp
2365   reset_last_Java_frame(java_thread, true);
2366 
2367    // C++ interp handles this in the interpreter
2368   check_and_handle_popframe(java_thread);
2369   check_and_handle_earlyret(java_thread);
2370 
2371   if (check_exceptions) {
2372     // check for pending exceptions (java_thread is set upon return)
2373     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2374 #ifndef _LP64
2375     jump_cc(Assembler::notEqual,
2376             RuntimeAddress(StubRoutines::forward_exception_entry()));
2377 #else
2378     // This used to conditionally jump to forward_exception, but if the code is
2379     // relocated the conditional branch might not reach the target. So we branch
2380     // around an unconditional jump that can always reach it.
2381 
2382     Label ok;
2383     jcc(Assembler::equal, ok);
2384     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2385     bind(ok);
2386 #endif // LP64
2387   }
2388 
2389   // get oop result if there is one and reset the value in the thread
2390   if (oop_result->is_valid()) {
2391     get_vm_result(oop_result, java_thread);
2392   }
2393 }
2394 
2395 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2396 
2397   // Calculating the value for last_Java_sp is somewhat subtle.
2398   // call_VM does an intermediate call which places a return address
2399   // on the stack just under the stack pointer as the user finished
2400   // with it. This allows us to retrieve last_Java_pc from last_Java_sp[-1].
2401   // On 32-bit we then have to push additional args on the stack to
2402   // accomplish the actual requested call. On 64-bit call_VM can only
2403   // use register args, so the only extra space is the return address
2404   // that call_VM created.
2405   // This hopefully explains the calculations here.
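  // For illustration only, the stack just before call_VM_base runs:
  //
  //   64-bit:  rsp -> [ return pc ] [ caller frame ... ]
  //                                 ^-- last_Java_sp = rsp + wordSize
  //   32-bit:  rsp -> [ pushed args ] [ return pc ] [ caller frame ... ]
  //                                                 ^-- last_Java_sp = rsp + (1 + n_args) * wordSize
  //
  // and last_Java_pc can later be recovered from last_Java_sp[-1].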
2406 
2407 #ifdef _LP64
2408   // We've pushed one address, correct last_Java_sp
2409   lea(rax, Address(rsp, wordSize));
2410 #else
2411   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2412 #endif // LP64
2413 
2414   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2415 
2416 }
2417 
2418 // Use this method when the MacroAssembler version of call_VM_leaf_base() should be called from the Interpreter.
2419 void MacroAssembler::call_VM_leaf0(address entry_point) {
2420   MacroAssembler::call_VM_leaf_base(entry_point, 0);
2421 }
2422 
2423 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2424   call_VM_leaf_base(entry_point, number_of_arguments);
2425 }
2426 
2427 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2428   pass_arg0(this, arg_0);
2429   call_VM_leaf(entry_point, 1);
2430 }
2431 
2432 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2433 
2434   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2435   pass_arg1(this, arg_1);
2436   pass_arg0(this, arg_0);
2437   call_VM_leaf(entry_point, 2);
2438 }
2439 
2440 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2441   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2442   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2443   pass_arg2(this, arg_2);
2444   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2445   pass_arg1(this, arg_1);
2446   pass_arg0(this, arg_0);
2447   call_VM_leaf(entry_point, 3);
2448 }
2449 
2450 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2451   pass_arg0(this, arg_0);
2452   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2453 }
2454 
2455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2456 
2457   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2458   pass_arg1(this, arg_1);
2459   pass_arg0(this, arg_0);
2460   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2461 }
2462 
2463 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2464   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2465   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2466   pass_arg2(this, arg_2);
2467   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2468   pass_arg1(this, arg_1);
2469   pass_arg0(this, arg_0);
2470   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2471 }
2472 
2473 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2474   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2475   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2476   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2477   pass_arg3(this, arg_3);
2478   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2479   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2480   pass_arg2(this, arg_2);
2481   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2482   pass_arg1(this, arg_1);
2483   pass_arg0(this, arg_0);
2484   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2485 }
2486 
2487 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2488   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2489   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2490   verify_oop(oop_result, "broken oop in call_VM_base");
2491 }
2492 
2493 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2494   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2495   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2496 }
2497 
2498 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2499 }
2500 
2501 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2502 }
2503 
2504 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2505   if (reachable(src1)) {
2506     cmpl(as_Address(src1), imm);
2507   } else {
2508     lea(rscratch1, src1);
2509     cmpl(Address(rscratch1, 0), imm);
2510   }
2511 }
2512 
2513 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2514   assert(!src2.is_lval(), "use cmpptr");
2515   if (reachable(src2)) {
2516     cmpl(src1, as_Address(src2));
2517   } else {
2518     lea(rscratch1, src2);
2519     cmpl(src1, Address(rscratch1, 0));
2520   }
2521 }
2522 
2523 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2524   Assembler::cmpl(src1, imm);
2525 }
2526 
2527 void MacroAssembler::cmp32(Register src1, Address src2) {
2528   Assembler::cmpl(src1, src2);
2529 }
2530 
2531 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2532   ucomisd(opr1, opr2);
2533 
2534   Label L;
2535   if (unordered_is_less) {
2536     movl(dst, -1);
2537     jcc(Assembler::parity, L);
2538     jcc(Assembler::below , L);
2539     movl(dst, 0);
2540     jcc(Assembler::equal , L);
2541     increment(dst);
2542   } else { // unordered is greater
2543     movl(dst, 1);
2544     jcc(Assembler::parity, L);
2545     jcc(Assembler::above , L);
2546     movl(dst, 0);
2547     jcc(Assembler::equal , L);
2548     decrementl(dst);
2549   }
2550   bind(L);
2551 }
2552 
2553 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2554   ucomiss(opr1, opr2);
2555 
2556   Label L;
2557   if (unordered_is_less) {
2558     movl(dst, -1);
2559     jcc(Assembler::parity, L);
2560     jcc(Assembler::below , L);
2561     movl(dst, 0);
2562     jcc(Assembler::equal , L);
2563     increment(dst);
2564   } else { // unordered is greater
2565     movl(dst, 1);
2566     jcc(Assembler::parity, L);
2567     jcc(Assembler::above , L);
2568     movl(dst, 0);
2569     jcc(Assembler::equal , L);
2570     decrementl(dst);
2571   }
2572   bind(L);
2573 }
2574 
2575 
2576 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2577   if (reachable(src1)) {
2578     cmpb(as_Address(src1), imm);
2579   } else {
2580     lea(rscratch1, src1);
2581     cmpb(Address(rscratch1, 0), imm);
2582   }
2583 }
2584 
2585 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2586 #ifdef _LP64
2587   if (src2.is_lval()) {
2588     movptr(rscratch1, src2);
2589     Assembler::cmpq(src1, rscratch1);
2590   } else if (reachable(src2)) {
2591     cmpq(src1, as_Address(src2));
2592   } else {
2593     lea(rscratch1, src2);
2594     Assembler::cmpq(src1, Address(rscratch1, 0));
2595   }
2596 #else
2597   if (src2.is_lval()) {
2598     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2599   } else {
2600     cmpl(src1, as_Address(src2));
2601   }
2602 #endif // _LP64
2603 }
2604 
2605 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2606   assert(src2.is_lval(), "not a mem-mem compare");
2607 #ifdef _LP64
2608   // moves src2's literal address
2609   movptr(rscratch1, src2);
2610   Assembler::cmpq(src1, rscratch1);
2611 #else
2612   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2613 #endif // _LP64
2614 }
2615 
2616 void MacroAssembler::cmpoop(Register src1, Register src2) {
2617   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2618   bs->obj_equals(this, src1, src2);
2619 }
2620 
2621 void MacroAssembler::cmpoop(Register src1, Address src2) {
2622   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2623   bs->obj_equals(this, src1, src2);
2624 }
2625 
2626 #ifdef _LP64
2627 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2628   movoop(rscratch1, src2);
2629   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2630   bs->obj_equals(this, src1, rscratch1);
2631 }
2632 #endif
2633 
2634 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2635   if (reachable(adr)) {
2636     if (os::is_MP())
2637       lock();
2638     cmpxchgptr(reg, as_Address(adr));
2639   } else {
2640     lea(rscratch1, adr);
2641     if (os::is_MP())
2642       lock();
2643     cmpxchgptr(reg, Address(rscratch1, 0));
2644   }
2645 }
2646 
2647 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2648   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2649 }
2650 
2651 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2652   if (reachable(src)) {
2653     Assembler::comisd(dst, as_Address(src));
2654   } else {
2655     lea(rscratch1, src);
2656     Assembler::comisd(dst, Address(rscratch1, 0));
2657   }
2658 }
2659 
2660 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2661   if (reachable(src)) {
2662     Assembler::comiss(dst, as_Address(src));
2663   } else {
2664     lea(rscratch1, src);
2665     Assembler::comiss(dst, Address(rscratch1, 0));
2666   }
2667 }
2668 
2669 
2670 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2671   Condition negated_cond = negate_condition(cond);
2672   Label L;
2673   jcc(negated_cond, L);
2674   pushf(); // Preserve flags
2675   atomic_incl(counter_addr);
2676   popf();
2677   bind(L);
2678 }
2679 
2680 int MacroAssembler::corrected_idivl(Register reg) {
2681   // Full implementation of Java idiv and irem; checks for
2682   // special case as described in JVM spec., p.243 & p.271.
2683   // The function returns the (pc) offset of the idivl
2684   // instruction - may be needed for implicit exceptions.
2685   //
2686   //         normal case                           special case
2687   //
2688   // input : rax,: dividend                         min_int
2689   //         reg: divisor   (may not be rax,/rdx)   -1
2690   //
2691   // output: rax,: quotient  (= rax, idiv reg)       min_int
2692   //         rdx: remainder (= rax, irem reg)       0
2693   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
2694   const int min_int = 0x80000000;
2695   Label normal_case, special_case;
2696 
2697   // check for special case
2698   cmpl(rax, min_int);
2699   jcc(Assembler::notEqual, normal_case);
2700   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2701   cmpl(reg, -1);
2702   jcc(Assembler::equal, special_case);
2703 
2704   // handle normal case
2705   bind(normal_case);
2706   cdql();
2707   int idivl_offset = offset();
2708   idivl(reg);
2709 
2710   // normal and special case exit
2711   bind(special_case);
2712 
2713   return idivl_offset;
2714 }
2715 
2716 
2717 
2718 void MacroAssembler::decrementl(Register reg, int value) {
2719   if (value == min_jint) {subl(reg, value) ; return; }
2720   if (value <  0) { incrementl(reg, -value); return; }
2721   if (value == 0) {                        ; return; }
2722   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2723   /* else */      { subl(reg, value)       ; return; }
2724 }
2725 
2726 void MacroAssembler::decrementl(Address dst, int value) {
2727   if (value == min_jint) {subl(dst, value) ; return; }
2728   if (value <  0) { incrementl(dst, -value); return; }
2729   if (value == 0) {                        ; return; }
2730   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2731   /* else */      { subl(dst, value)       ; return; }
2732 }
2733 
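// Signed division of 'reg' by 2^shift_value, rounding toward zero: a negative
// dividend is biased by (2^shift_value - 1) before the arithmetic shift so
// that, for example, -7 / 4 yields -1 rather than -2.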
2734 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2735   assert (shift_value > 0, "illegal shift value");
2736   Label _is_positive;
2737   testl (reg, reg);
2738   jcc (Assembler::positive, _is_positive);
2739   int offset = (1 << shift_value) - 1 ;
2740 
2741   if (offset == 1) {
2742     incrementl(reg);
2743   } else {
2744     addl(reg, offset);
2745   }
2746 
2747   bind (_is_positive);
2748   sarl(reg, shift_value);
2749 }
2750 
2751 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2752   if (reachable(src)) {
2753     Assembler::divsd(dst, as_Address(src));
2754   } else {
2755     lea(rscratch1, src);
2756     Assembler::divsd(dst, Address(rscratch1, 0));
2757   }
2758 }
2759 
2760 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2761   if (reachable(src)) {
2762     Assembler::divss(dst, as_Address(src));
2763   } else {
2764     lea(rscratch1, src);
2765     Assembler::divss(dst, Address(rscratch1, 0));
2766   }
2767 }
2768 
2769 // !defined(COMPILER2) is because of stupid core builds
2770 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2771 void MacroAssembler::empty_FPU_stack() {
2772   if (VM_Version::supports_mmx()) {
2773     emms();
2774   } else {
2775     for (int i = 8; i-- > 0; ) ffree(i);
2776   }
2777 }
2778 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2779 
2780 
2781 void MacroAssembler::enter() {
2782   push(rbp);
2783   mov(rbp, rsp);
2784 }
2785 
2786 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2787 void MacroAssembler::fat_nop() {
2788   if (UseAddressNop) {
2789     addr_nop_5();
2790   } else {
2791     emit_int8(0x26); // es:
2792     emit_int8(0x2e); // cs:
2793     emit_int8(0x64); // fs:
2794     emit_int8(0x65); // gs:
2795     emit_int8((unsigned char)0x90);
2796   }
2797 }
2798 
2799 void MacroAssembler::fcmp(Register tmp) {
2800   fcmp(tmp, 1, true, true);
2801 }
2802 
2803 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2804   assert(!pop_right || pop_left, "usage error");
2805   if (VM_Version::supports_cmov()) {
2806     assert(tmp == noreg, "unneeded temp");
2807     if (pop_left) {
2808       fucomip(index);
2809     } else {
2810       fucomi(index);
2811     }
2812     if (pop_right) {
2813       fpop();
2814     }
2815   } else {
2816     assert(tmp != noreg, "need temp");
2817     if (pop_left) {
2818       if (pop_right) {
2819         fcompp();
2820       } else {
2821         fcomp(index);
2822       }
2823     } else {
2824       fcom(index);
2825     }
2826     // convert FPU condition into eflags condition via rax,
2827     save_rax(tmp);
2828     fwait(); fnstsw_ax();
2829     sahf();
2830     restore_rax(tmp);
2831   }
2832   // condition codes set as follows:
2833   //
2834   // CF (corresponds to C0) if x < y
2835   // PF (corresponds to C2) if unordered
2836   // ZF (corresponds to C3) if x = y
2837 }
2838 
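     // Compare the FPU stack tops and materialize -1, 0 or +1 in dst;
     // unordered_is_less selects the value produced for an unordered (NaN) compare.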
2839 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2840   fcmp2int(dst, unordered_is_less, 1, true, true);
2841 }
2842 
2843 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2844   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2845   Label L;
2846   if (unordered_is_less) {
2847     movl(dst, -1);
2848     jcc(Assembler::parity, L);
2849     jcc(Assembler::below , L);
2850     movl(dst, 0);
2851     jcc(Assembler::equal , L);
2852     increment(dst);
2853   } else { // unordered is greater
2854     movl(dst, 1);
2855     jcc(Assembler::parity, L);
2856     jcc(Assembler::above , L);
2857     movl(dst, 0);
2858     jcc(Assembler::equal , L);
2859     decrementl(dst);
2860   }
2861   bind(L);
2862 }
2863 
2864 void MacroAssembler::fld_d(AddressLiteral src) {
2865   fld_d(as_Address(src));
2866 }
2867 
2868 void MacroAssembler::fld_s(AddressLiteral src) {
2869   fld_s(as_Address(src));
2870 }
2871 
2872 void MacroAssembler::fld_x(AddressLiteral src) {
2873   Assembler::fld_x(as_Address(src));
2874 }
2875 
2876 void MacroAssembler::fldcw(AddressLiteral src) {
2877   Assembler::fldcw(as_Address(src));
2878 }
2879 
2880 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2881   if (reachable(src)) {
2882     Assembler::mulpd(dst, as_Address(src));
2883   } else {
2884     lea(rscratch1, src);
2885     Assembler::mulpd(dst, Address(rscratch1, 0));
2886   }
2887 }
2888 
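     // Set the x87 precision control field (bits 8 and 9 of the control word)
     // to 64-bit extended precision.  restore_precision() reloads the original
     // control word, which increase_precision() leaves on the stack.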
2889 void MacroAssembler::increase_precision() {
2890   subptr(rsp, BytesPerWord);
2891   fnstcw(Address(rsp, 0));
2892   movl(rax, Address(rsp, 0));
2893   orl(rax, 0x300);
2894   push(rax);
2895   fldcw(Address(rsp, 0));
2896   pop(rax);
2897 }
2898 
2899 void MacroAssembler::restore_precision() {
2900   fldcw(Address(rsp, 0));
2901   addptr(rsp, BytesPerWord);
2902 }
2903 
2904 void MacroAssembler::fpop() {
2905   ffree();
2906   fincstp();
2907 }
2908 
2909 void MacroAssembler::load_float(Address src) {
2910   if (UseSSE >= 1) {
2911     movflt(xmm0, src);
2912   } else {
2913     LP64_ONLY(ShouldNotReachHere());
2914     NOT_LP64(fld_s(src));
2915   }
2916 }
2917 
2918 void MacroAssembler::store_float(Address dst) {
2919   if (UseSSE >= 1) {
2920     movflt(dst, xmm0);
2921   } else {
2922     LP64_ONLY(ShouldNotReachHere());
2923     NOT_LP64(fstp_s(dst));
2924   }
2925 }
2926 
2927 void MacroAssembler::load_double(Address src) {
2928   if (UseSSE >= 2) {
2929     movdbl(xmm0, src);
2930   } else {
2931     LP64_ONLY(ShouldNotReachHere());
2932     NOT_LP64(fld_d(src));
2933   }
2934 }
2935 
2936 void MacroAssembler::store_double(Address dst) {
2937   if (UseSSE >= 2) {
2938     movdbl(dst, xmm0);
2939   } else {
2940     LP64_ONLY(ShouldNotReachHere());
2941     NOT_LP64(fstp_d(dst));
2942   }
2943 }
2944 
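     // Spill / reload a full 512-bit register through the stack; used below to
     // free an xmm register for use as a temporary.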
2945 void MacroAssembler::push_zmm(XMMRegister reg) {
2946   lea(rsp, Address(rsp, -64)); // Use lea to not affect flags
2947   evmovdqul(Address(rsp, 0), reg, Assembler::AVX_512bit);
2948 }
2949 
2950 void MacroAssembler::pop_zmm(XMMRegister reg) {
2951   evmovdqul(reg, Address(rsp, 0), Assembler::AVX_512bit);
2952   lea(rsp, Address(rsp, 64)); // Use lea to not affect flags
2953 }
2954 
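     // st0 = st0 rem st1.  fprem may reduce the operand only partially, so loop
     // until the FPU status word reports the reduction complete (C2, bit 10, clear).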
2955 void MacroAssembler::fremr(Register tmp) {
2956   save_rax(tmp);
2957   { Label L;
2958     bind(L);
2959     fprem();
2960     fwait(); fnstsw_ax();
2961 #ifdef _LP64
2962     testl(rax, 0x400);
2963     jcc(Assembler::notEqual, L);
2964 #else
2965     sahf();
2966     jcc(Assembler::parity, L);
2967 #endif // _LP64
2968   }
2969   restore_rax(tmp);
2970   // Result is in ST0.
2971   // Note: fxch & fpop to get rid of ST1
2972   // (otherwise FPU stack could overflow eventually)
2973   fxch(1);
2974   fpop();
2975 }
2976 
2977 // dst = c = a * b + c
2978 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2979   Assembler::vfmadd231sd(c, a, b);
2980   if (dst != c) {
2981     movdbl(dst, c);
2982   }
2983 }
2984 
2985 // dst = c = a * b + c
2986 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2987   Assembler::vfmadd231ss(c, a, b);
2988   if (dst != c) {
2989     movflt(dst, c);
2990   }
2991 }
2992 
2993 // dst = c = a * b + c
2994 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2995   Assembler::vfmadd231pd(c, a, b, vector_len);
2996   if (dst != c) {
2997     vmovdqu(dst, c);
2998   }
2999 }
3000 
3001 // dst = c = a * b + c
3002 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
3003   Assembler::vfmadd231ps(c, a, b, vector_len);
3004   if (dst != c) {
3005     vmovdqu(dst, c);
3006   }
3007 }
3008 
3009 // dst = c = a * b + c
3010 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
3011   Assembler::vfmadd231pd(c, a, b, vector_len);
3012   if (dst != c) {
3013     vmovdqu(dst, c);
3014   }
3015 }
3016 
3017 // dst = c = a * b + c
3018 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
3019   Assembler::vfmadd231ps(c, a, b, vector_len);
3020   if (dst != c) {
3021     vmovdqu(dst, c);
3022   }
3023 }
3024 
3025 void MacroAssembler::incrementl(AddressLiteral dst) {
3026   if (reachable(dst)) {
3027     incrementl(as_Address(dst));
3028   } else {
3029     lea(rscratch1, dst);
3030     incrementl(Address(rscratch1, 0));
3031   }
3032 }
3033 
3034 void MacroAssembler::incrementl(ArrayAddress dst) {
3035   incrementl(as_Address(dst));
3036 }
3037 
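     // Increment a register by a constant; the min_jint special case mirrors
     // decrementl() above.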
3038 void MacroAssembler::incrementl(Register reg, int value) {
3039   if (value == min_jint) { addl(reg, value); return; }
3040   if (value <  0) { decrementl(reg, -value); return; }
3041   if (value == 0) {                        ; return; }
3042   if (value == 1 && UseIncDec) { incl(reg) ; return; }
3043   /* else */      { addl(reg, value)       ; return; }
3044 }
3045 
3046 void MacroAssembler::incrementl(Address dst, int value) {
3047   if (value == min_jint) { addl(dst, value); return; }
3048   if (value <  0) { decrementl(dst, -value); return; }
3049   if (value == 0) {                        ; return; }
3050   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3051   /* else */      { addl(dst, value)       ; return; }
3052 }
3053 
3054 void MacroAssembler::jump(AddressLiteral dst) {
3055   if (reachable(dst)) {
3056     jmp_literal(dst.target(), dst.rspec());
3057   } else {
3058     lea(rscratch1, dst);
3059     jmp(rscratch1);
3060   }
3061 }
3062 
3063 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3064   if (reachable(dst)) {
3065     InstructionMark im(this);
3066     relocate(dst.reloc());
3067     const int short_size = 2;
3068     const int long_size = 6;
3069     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3070     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3071       // 0111 tttn #8-bit disp
3072       emit_int8(0x70 | cc);
3073       emit_int8((offs - short_size) & 0xFF);
3074     } else {
3075       // 0000 1111 1000 tttn #32-bit disp
3076       emit_int8(0x0F);
3077       emit_int8((unsigned char)(0x80 | cc));
3078       emit_int32(offs - long_size);
3079     }
3080   } else {
3081 #ifdef ASSERT
3082     warning("reversing conditional branch");
3083 #endif /* ASSERT */
3084     Label skip;
3085     jccb(reverse[cc], skip);
3086     lea(rscratch1, dst);
3087     Assembler::jmp(rscratch1);
3088     bind(skip);
3089   }
3090 }
3091 
3092 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3093   if (reachable(src)) {
3094     Assembler::ldmxcsr(as_Address(src));
3095   } else {
3096     lea(rscratch1, src);
3097     Assembler::ldmxcsr(Address(rscratch1, 0));
3098   }
3099 }
3100 
3101 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3102   int off;
3103   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3104     off = offset();
3105     movsbl(dst, src); // movsxb
3106   } else {
3107     off = load_unsigned_byte(dst, src);
3108     shll(dst, 24);
3109     sarl(dst, 24);
3110   }
3111   return off;
3112 }
3113 
3114 // Note: load_signed_short used to be called load_signed_word.
3115 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3116 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3117 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3118 int MacroAssembler::load_signed_short(Register dst, Address src) {
3119   int off;
3120   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3121     // It would seem safe to sign-extend 16 => 64 bits here, but the 64-bit
3122     // code has always sign-extended to 32 bits only, which implies that
3123     // callers rely on no more than 32 bits of the result.
3124     off = offset();
3125     movswl(dst, src); // movsxw
3126   } else {
3127     off = load_unsigned_short(dst, src);
3128     shll(dst, 16);
3129     sarl(dst, 16);
3130   }
3131   return off;
3132 }
3133 
3134 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3135   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3136   // and "3.9 Partial Register Penalties" (p. 22).
3137   int off;
3138   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3139     off = offset();
3140     movzbl(dst, src); // movzxb
3141   } else {
3142     xorl(dst, dst);
3143     off = offset();
3144     movb(dst, src);
3145   }
3146   return off;
3147 }
3148 
3149 // Note: load_unsigned_short used to be called load_unsigned_word.
3150 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3151   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3152   // and "3.9 Partial Register Penalties" (p. 22).
3153   int off;
3154   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3155     off = offset();
3156     movzwl(dst, src); // movzxw
3157   } else {
3158     xorl(dst, dst);
3159     off = offset();
3160     movw(dst, src);
3161   }
3162   return off;
3163 }
3164 
3165 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3166   switch (size_in_bytes) {
3167 #ifndef _LP64
3168   case  8:
3169     assert(dst2 != noreg, "second dest register required");
3170     movl(dst,  src);
3171     movl(dst2, src.plus_disp(BytesPerInt));
3172     break;
3173 #else
3174   case  8:  movq(dst, src); break;
3175 #endif
3176   case  4:  movl(dst, src); break;
3177   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3178   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3179   default:  ShouldNotReachHere();
3180   }
3181 }
3182 
3183 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3184   switch (size_in_bytes) {
3185 #ifndef _LP64
3186   case  8:
3187     assert(src2 != noreg, "second source register required");
3188     movl(dst,                        src);
3189     movl(dst.plus_disp(BytesPerInt), src2);
3190     break;
3191 #else
3192   case  8:  movq(dst, src); break;
3193 #endif
3194   case  4:  movl(dst, src); break;
3195   case  2:  movw(dst, src); break;
3196   case  1:  movb(dst, src); break;
3197   default:  ShouldNotReachHere();
3198   }
3199 }
3200 
3201 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3202   if (reachable(dst)) {
3203     movl(as_Address(dst), src);
3204   } else {
3205     lea(rscratch1, dst);
3206     movl(Address(rscratch1, 0), src);
3207   }
3208 }
3209 
3210 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3211   if (reachable(src)) {
3212     movl(dst, as_Address(src));
3213   } else {
3214     lea(rscratch1, src);
3215     movl(dst, Address(rscratch1, 0));
3216   }
3217 }
3218 
3219 // C++ bool manipulation
3220 
3221 void MacroAssembler::movbool(Register dst, Address src) {
3222   if (sizeof(bool) == 1)
3223     movb(dst, src);
3224   else if (sizeof(bool) == 2)
3225     movw(dst, src);
3226   else if (sizeof(bool) == 4)
3227     movl(dst, src);
3228   else
3229     // unsupported
3230     ShouldNotReachHere();
3231 }
3232 
3233 void MacroAssembler::movbool(Address dst, bool boolconst) {
3234   if (sizeof(bool) == 1)
3235     movb(dst, (int) boolconst);
3236   else if (sizeof(bool) == 2)
3237     movw(dst, (int) boolconst);
3238   else if (sizeof(bool) == 4)
3239     movl(dst, (int) boolconst);
3240   else
3241     // unsupported
3242     ShouldNotReachHere();
3243 }
3244 
3245 void MacroAssembler::movbool(Address dst, Register src) {
3246   if (sizeof(bool) == 1)
3247     movb(dst, src);
3248   else if (sizeof(bool) == 2)
3249     movw(dst, src);
3250   else if (sizeof(bool) == 4)
3251     movl(dst, src);
3252   else
3253     // unsupported
3254     ShouldNotReachHere();
3255 }
3256 
3257 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3258   movb(as_Address(dst), src);
3259 }
3260 
3261 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3262   if (reachable(src)) {
3263     movdl(dst, as_Address(src));
3264   } else {
3265     lea(rscratch1, src);
3266     movdl(dst, Address(rscratch1, 0));
3267   }
3268 }
3269 
3270 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3271   if (reachable(src)) {
3272     movq(dst, as_Address(src));
3273   } else {
3274     lea(rscratch1, src);
3275     movq(dst, Address(rscratch1, 0));
3276   }
3277 }
3278 
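     // Load k1 with the mask (1 << src) - 1 so that masked AVX-512 operations
     // touch only the low 'src' elements; dst is clobbered and ends up holding src.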
3279 void MacroAssembler::setvectmask(Register dst, Register src) {
3280   Assembler::movl(dst, 1);
3281   Assembler::shlxl(dst, dst, src);
3282   Assembler::decl(dst);
3283   Assembler::kmovdl(k1, dst);
3284   Assembler::movl(dst, src);
3285 }
3286 
3287 void MacroAssembler::restorevectmask() {
3288   Assembler::knotwl(k1, k0);
3289 }
3290 
3291 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3292   if (reachable(src)) {
3293     if (UseXmmLoadAndClearUpper) {
3294       movsd (dst, as_Address(src));
3295     } else {
3296       movlpd(dst, as_Address(src));
3297     }
3298   } else {
3299     lea(rscratch1, src);
3300     if (UseXmmLoadAndClearUpper) {
3301       movsd (dst, Address(rscratch1, 0));
3302     } else {
3303       movlpd(dst, Address(rscratch1, 0));
3304     }
3305   }
3306 }
3307 
3308 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3309   if (reachable(src)) {
3310     movss(dst, as_Address(src));
3311   } else {
3312     lea(rscratch1, src);
3313     movss(dst, Address(rscratch1, 0));
3314   }
3315 }
3316 
3317 void MacroAssembler::movptr(Register dst, Register src) {
3318   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3319 }
3320 
3321 void MacroAssembler::movptr(Register dst, Address src) {
3322   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3323 }
3324 
3325 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3326 void MacroAssembler::movptr(Register dst, intptr_t src) {
3327   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3328 }
3329 
3330 void MacroAssembler::movptr(Address dst, Register src) {
3331   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3332 }
3333 
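     // Without AVX512VL the legacy 128/256-bit move encodings cannot name
     // xmm16-xmm31, so the variants below fall back to EVEX-encoded
     // extract/insert/move forms for those registers.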
3334 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3335   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3336     Assembler::vextractf32x4(dst, src, 0);
3337   } else {
3338     Assembler::movdqu(dst, src);
3339   }
3340 }
3341 
3342 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3343   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3344     Assembler::vinsertf32x4(dst, dst, src, 0);
3345   } else {
3346     Assembler::movdqu(dst, src);
3347   }
3348 }
3349 
3350 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3351   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3352     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3353   } else {
3354     Assembler::movdqu(dst, src);
3355   }
3356 }
3357 
3358 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3359   if (reachable(src)) {
3360     movdqu(dst, as_Address(src));
3361   } else {
3362     lea(scratchReg, src);
3363     movdqu(dst, Address(scratchReg, 0));
3364   }
3365 }
3366 
3367 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3368   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3369     vextractf64x4_low(dst, src);
3370   } else {
3371     Assembler::vmovdqu(dst, src);
3372   }
3373 }
3374 
3375 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3376   if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3377     vinsertf64x4_low(dst, src);
3378   } else {
3379     Assembler::vmovdqu(dst, src);
3380   }
3381 }
3382 
3383 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3384   if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3385     Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3386   }
3387   else {
3388     Assembler::vmovdqu(dst, src);
3389   }
3390 }
3391 
3392 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3393   if (reachable(src)) {
3394     vmovdqu(dst, as_Address(src));
3395   }
3396   else {
3397     lea(rscratch1, src);
3398     vmovdqu(dst, Address(rscratch1, 0));
3399   }
3400 }
3401 
3402 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3403   if (reachable(src)) {
3404     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3405   } else {
3406     lea(rscratch, src);
3407     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3408   }
3409 }
3410 
3411 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3412   if (reachable(src)) {
3413     Assembler::movdqa(dst, as_Address(src));
3414   } else {
3415     lea(rscratch1, src);
3416     Assembler::movdqa(dst, Address(rscratch1, 0));
3417   }
3418 }
3419 
3420 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3421   if (reachable(src)) {
3422     Assembler::movsd(dst, as_Address(src));
3423   } else {
3424     lea(rscratch1, src);
3425     Assembler::movsd(dst, Address(rscratch1, 0));
3426   }
3427 }
3428 
3429 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3430   if (reachable(src)) {
3431     Assembler::movss(dst, as_Address(src));
3432   } else {
3433     lea(rscratch1, src);
3434     Assembler::movss(dst, Address(rscratch1, 0));
3435   }
3436 }
3437 
3438 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3439   if (reachable(src)) {
3440     Assembler::mulsd(dst, as_Address(src));
3441   } else {
3442     lea(rscratch1, src);
3443     Assembler::mulsd(dst, Address(rscratch1, 0));
3444   }
3445 }
3446 
3447 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3448   if (reachable(src)) {
3449     Assembler::mulss(dst, as_Address(src));
3450   } else {
3451     lea(rscratch1, src);
3452     Assembler::mulss(dst, Address(rscratch1, 0));
3453   }
3454 }
3455 
3456 void MacroAssembler::null_check(Register reg, int offset) {
3457   if (needs_explicit_null_check(offset)) {
3458     // provoke OS NULL exception if reg = NULL by
3459     // accessing M[reg] w/o changing any (non-CC) registers
3460     // NOTE: cmpl is plenty here to provoke a segv
3461     cmpptr(rax, Address(reg, 0));
3462     // Note: should probably use testl(rax, Address(reg, 0));
3463     //       may be shorter code (however, this version of
3464     //       testl needs to be implemented first)
3465   } else {
3466     // nothing to do, (later) access of M[reg + offset]
3467     // will provoke OS NULL exception if reg = NULL
3468   }
3469 }
3470 
3471 void MacroAssembler::os_breakpoint() {
3472   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3473   // (e.g., MSVC can't call ps() otherwise)
3474   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3475 }
3476 
3477 void MacroAssembler::unimplemented(const char* what) {
3478   const char* buf = NULL;
3479   {
3480     ResourceMark rm;
3481     stringStream ss;
3482     ss.print("unimplemented: %s", what);
3483     buf = code_string(ss.as_string());
3484   }
3485   stop(buf);
3486 }
3487 
3488 #ifdef _LP64
3489 #define XSTATE_BV 0x200
3490 #endif
3491 
3492 void MacroAssembler::pop_CPU_state() {
3493   pop_FPU_state();
3494   pop_IU_state();
3495 }
3496 
3497 void MacroAssembler::pop_FPU_state() {
3498 #ifndef _LP64
3499   frstor(Address(rsp, 0));
3500 #else
3501   fxrstor(Address(rsp, 0));
3502 #endif
3503   addptr(rsp, FPUStateSizeInWords * wordSize);
3504 }
3505 
3506 void MacroAssembler::pop_IU_state() {
3507   popa();
3508   LP64_ONLY(addq(rsp, 8));
3509   popf();
3510 }
3511 
3512 // Save Integer and Float state
3513 // Warning: Stack must be 16 byte aligned (64bit)
3514 void MacroAssembler::push_CPU_state() {
3515   push_IU_state();
3516   push_FPU_state();
3517 }
3518 
3519 void MacroAssembler::push_FPU_state() {
3520   subptr(rsp, FPUStateSizeInWords * wordSize);
3521 #ifndef _LP64
3522   fnsave(Address(rsp, 0));
3523   fwait();
3524 #else
3525   fxsave(Address(rsp, 0));
3526 #endif // LP64
3527 }
3528 
3529 void MacroAssembler::push_IU_state() {
3530   // Push flags first because pusha kills them
3531   pushf();
3532   // Make sure rsp stays 16-byte aligned
3533   LP64_ONLY(subq(rsp, 8));
3534   pusha();
3535 }
3536 
3537 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
3538   if (!java_thread->is_valid()) {
3539     java_thread = rdi;
3540     get_thread(java_thread);
3541   }
3542   // we must set sp to zero to clear frame
3543   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3544   if (clear_fp) {
3545     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3546   }
3547 
3548   // Always clear the pc because it could have been set by make_walkable()
3549   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3550 
3551   vzeroupper();
3552 }
3553 
3554 void MacroAssembler::restore_rax(Register tmp) {
3555   if (tmp == noreg) pop(rax);
3556   else if (tmp != rax) mov(rax, tmp);
3557 }
3558 
3559 void MacroAssembler::round_to(Register reg, int modulus) {
3560   addptr(reg, modulus - 1);
3561   andptr(reg, -modulus);
3562 }
3563 
3564 void MacroAssembler::save_rax(Register tmp) {
3565   if (tmp == noreg) push(rax);
3566   else if (tmp != rax) mov(tmp, rax);
3567 }
3568 
3569 // Write serialization page so VM thread can do a pseudo remote membar.
3570 // We use the current thread pointer to calculate a thread specific
3571 // offset to write to within the page. This minimizes bus traffic
3572 // due to cache line collision.
3573 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3574   movl(tmp, thread);
3575   shrl(tmp, os::get_serialize_page_shift_count());
3576   andl(tmp, (os::vm_page_size() - sizeof(int)));
3577 
3578   Address index(noreg, tmp, Address::times_1);
3579   ExternalAddress page(os::get_memory_serialize_page());
3580 
3581   // Size of store must match masking code above
3582   movl(as_Address(ArrayAddress(page, index)), tmp);
3583 }
3584 
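     // Safepoint poll: with thread-local polling, test the poll bit kept with
     // the thread's polling page pointer; otherwise compare the global safepoint
     // state against _not_synchronized.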
3585 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3586   if (SafepointMechanism::uses_thread_local_poll()) {
3587 #ifdef _LP64
3588     assert(thread_reg == r15_thread, "should be");
3589 #else
3590     if (thread_reg == noreg) {
3591       thread_reg = temp_reg;
3592       get_thread(thread_reg);
3593     }
3594 #endif
3595     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3596     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3597   } else {
3598     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3599         SafepointSynchronize::_not_synchronized);
3600     jcc(Assembler::notEqual, slow_path);
3601   }
3602 }
3603 
3604 // Calls to C land
3605 //
3606 // When entering C land, the rbp & rsp of the last Java frame have to be recorded
3607 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3608 // has to be reset to 0. This is required to allow proper stack traversal.
3609 void MacroAssembler::set_last_Java_frame(Register java_thread,
3610                                          Register last_java_sp,
3611                                          Register last_java_fp,
3612                                          address  last_java_pc) {
3613   vzeroupper();
3614   // determine java_thread register
3615   if (!java_thread->is_valid()) {
3616     java_thread = rdi;
3617     get_thread(java_thread);
3618   }
3619   // determine last_java_sp register
3620   if (!last_java_sp->is_valid()) {
3621     last_java_sp = rsp;
3622   }
3623 
3624   // last_java_fp is optional
3625 
3626   if (last_java_fp->is_valid()) {
3627     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3628   }
3629 
3630   // last_java_pc is optional
3631 
3632   if (last_java_pc != NULL) {
3633     lea(Address(java_thread,
3634                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3635         InternalAddress(last_java_pc));
3636 
3637   }
3638   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3639 }
3640 
3641 void MacroAssembler::shlptr(Register dst, int imm8) {
3642   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3643 }
3644 
3645 void MacroAssembler::shrptr(Register dst, int imm8) {
3646   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3647 }
3648 
3649 void MacroAssembler::sign_extend_byte(Register reg) {
3650   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3651     movsbl(reg, reg); // movsxb
3652   } else {
3653     shll(reg, 24);
3654     sarl(reg, 24);
3655   }
3656 }
3657 
3658 void MacroAssembler::sign_extend_short(Register reg) {
3659   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3660     movswl(reg, reg); // movsxw
3661   } else {
3662     shll(reg, 16);
3663     sarl(reg, 16);
3664   }
3665 }
3666 
3667 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3668   assert(reachable(src), "Address should be reachable");
3669   testl(dst, as_Address(src));
3670 }
3671 
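     // The helpers below cope with registers xmm16-xmm31, which the legacy
     // (non-EVEX) encodings cannot address: such operands are routed through
     // xmm0/xmm1, saved and restored with push_zmm/pop_zmm.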
3672 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3673   int dst_enc = dst->encoding();
3674   int src_enc = src->encoding();
3675   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3676     Assembler::pcmpeqb(dst, src);
3677   } else if ((dst_enc < 16) && (src_enc < 16)) {
3678     Assembler::pcmpeqb(dst, src);
3679   } else if (src_enc < 16) {
3680     push_zmm(xmm0);
3681     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3682     Assembler::pcmpeqb(xmm0, src);
3683     movdqu(dst, xmm0);
3684     pop_zmm(xmm0);
3685   } else if (dst_enc < 16) {
3686     push_zmm(xmm0);
3687     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3688     Assembler::pcmpeqb(dst, xmm0);
3689     pop_zmm(xmm0);
3690   } else {
3691     push_zmm(xmm0);
3692     push_zmm(xmm1);
3693     movdqu(xmm0, src);
3694     movdqu(xmm1, dst);
3695     Assembler::pcmpeqb(xmm1, xmm0);
3696     movdqu(dst, xmm1);
3697     pop_zmm(xmm1);
3698     pop_zmm(xmm0);
3699   }
3700 }
3701 
3702 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3703   int dst_enc = dst->encoding();
3704   int src_enc = src->encoding();
3705   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3706     Assembler::pcmpeqw(dst, src);
3707   } else if ((dst_enc < 16) && (src_enc < 16)) {
3708     Assembler::pcmpeqw(dst, src);
3709   } else if (src_enc < 16) {
3710     push_zmm(xmm0);
3711     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3712     Assembler::pcmpeqw(xmm0, src);
3713     movdqu(dst, xmm0);
3714     pop_zmm(xmm0);
3715   } else if (dst_enc < 16) {
3716     push_zmm(xmm0);
3717     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3718     Assembler::pcmpeqw(dst, xmm0);
3719     pop_zmm(xmm0);
3720   } else {
3721     push_zmm(xmm0);
3722     push_zmm(xmm1);
3723     movdqu(xmm0, src);
3724     movdqu(xmm1, dst);
3725     Assembler::pcmpeqw(xmm1, xmm0);
3726     movdqu(dst, xmm1);
3727     pop_zmm(xmm1);
3728     pop_zmm(xmm0);
3729   }
3730 }
3731 
3732 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3733   int dst_enc = dst->encoding();
3734   if (dst_enc < 16) {
3735     Assembler::pcmpestri(dst, src, imm8);
3736   } else {
3737     push_zmm(xmm0);
3738     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3739     Assembler::pcmpestri(xmm0, src, imm8);
3740     movdqu(dst, xmm0);
3741     pop_zmm(xmm0);
3742   }
3743 }
3744 
3745 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3746   int dst_enc = dst->encoding();
3747   int src_enc = src->encoding();
3748   if ((dst_enc < 16) && (src_enc < 16)) {
3749     Assembler::pcmpestri(dst, src, imm8);
3750   } else if (src_enc < 16) {
3751     push_zmm(xmm0);
3752     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3753     Assembler::pcmpestri(xmm0, src, imm8);
3754     movdqu(dst, xmm0);
3755     pop_zmm(xmm0);
3756   } else if (dst_enc < 16) {
3757     push_zmm(xmm0);
3758     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3759     Assembler::pcmpestri(dst, xmm0, imm8);
3760     pop_zmm(xmm0);
3761   } else {
3762     push_zmm(xmm0);
3763     push_zmm(xmm1);
3764     movdqu(xmm0, src);
3765     movdqu(xmm1, dst);
3766     Assembler::pcmpestri(xmm1, xmm0, imm8);
3767     movdqu(dst, xmm1);
3768     pop_zmm(xmm1);
3769     pop_zmm(xmm0);
3770   }
3771 }
3772 
3773 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3774   int dst_enc = dst->encoding();
3775   int src_enc = src->encoding();
3776   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3777     Assembler::pmovzxbw(dst, src);
3778   } else if ((dst_enc < 16) && (src_enc < 16)) {
3779     Assembler::pmovzxbw(dst, src);
3780   } else if (src_enc < 16) {
3781     push_zmm(xmm0);
3782     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3783     Assembler::pmovzxbw(xmm0, src);
3784     movdqu(dst, xmm0);
3785     pop_zmm(xmm0);
3786   } else if (dst_enc < 16) {
3787     push_zmm(xmm0);
3788     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3789     Assembler::pmovzxbw(dst, xmm0);
3790     pop_zmm(xmm0);
3791   } else {
3792     push_zmm(xmm0);
3793     push_zmm(xmm1);
3794     movdqu(xmm0, src);
3795     movdqu(xmm1, dst);
3796     Assembler::pmovzxbw(xmm1, xmm0);
3797     movdqu(dst, xmm1);
3798     pop_zmm(xmm1);
3799     pop_zmm(xmm0);
3800   }
3801 }
3802 
3803 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3804   int dst_enc = dst->encoding();
3805   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3806     Assembler::pmovzxbw(dst, src);
3807   } else if (dst_enc < 16) {
3808     Assembler::pmovzxbw(dst, src);
3809   } else {
3810     push_zmm(xmm0);
3811     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3812     Assembler::pmovzxbw(xmm0, src);
3813     movdqu(dst, xmm0);
3814     pop_zmm(xmm0);
3815   }
3816 }
3817 
3818 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3819   int src_enc = src->encoding();
3820   if (src_enc < 16) {
3821     Assembler::pmovmskb(dst, src);
3822   } else {
3823     push_zmm(xmm0);
3824     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3825     Assembler::pmovmskb(dst, xmm0);
3826     pop_zmm(xmm0);
3827   }
3828 }
3829 
3830 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3831   int dst_enc = dst->encoding();
3832   int src_enc = src->encoding();
3833   if ((dst_enc < 16) && (src_enc < 16)) {
3834     Assembler::ptest(dst, src);
3835   } else if (src_enc < 16) {
3836     push_zmm(xmm0);
3837     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3838     Assembler::ptest(xmm0, src);
3839     pop_zmm(xmm0);
3840   } else if (dst_enc < 16) {
3841     push_zmm(xmm0);
3842     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3843     Assembler::ptest(dst, xmm0);
3844     pop_zmm(xmm0);
3845   } else {
3846     push_zmm(xmm0);
3847     push_zmm(xmm1);
3848     movdqu(xmm0, src);
3849     movdqu(xmm1, dst);
3850     Assembler::ptest(xmm1, xmm0);
3851     pop_zmm(xmm1);
3852     pop_zmm(xmm0);
3853   }
3854 }
3855 
3856 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3857   if (reachable(src)) {
3858     Assembler::sqrtsd(dst, as_Address(src));
3859   } else {
3860     lea(rscratch1, src);
3861     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3862   }
3863 }
3864 
3865 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3866   if (reachable(src)) {
3867     Assembler::sqrtss(dst, as_Address(src));
3868   } else {
3869     lea(rscratch1, src);
3870     Assembler::sqrtss(dst, Address(rscratch1, 0));
3871   }
3872 }
3873 
3874 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3875   if (reachable(src)) {
3876     Assembler::subsd(dst, as_Address(src));
3877   } else {
3878     lea(rscratch1, src);
3879     Assembler::subsd(dst, Address(rscratch1, 0));
3880   }
3881 }
3882 
3883 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3884   if (reachable(src)) {
3885     Assembler::subss(dst, as_Address(src));
3886   } else {
3887     lea(rscratch1, src);
3888     Assembler::subss(dst, Address(rscratch1, 0));
3889   }
3890 }
3891 
3892 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3893   if (reachable(src)) {
3894     Assembler::ucomisd(dst, as_Address(src));
3895   } else {
3896     lea(rscratch1, src);
3897     Assembler::ucomisd(dst, Address(rscratch1, 0));
3898   }
3899 }
3900 
3901 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3902   if (reachable(src)) {
3903     Assembler::ucomiss(dst, as_Address(src));
3904   } else {
3905     lea(rscratch1, src);
3906     Assembler::ucomiss(dst, Address(rscratch1, 0));
3907   }
3908 }
3909 
3910 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3911   // Used in sign-bit flipping with aligned address.
3912   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3913   if (reachable(src)) {
3914     Assembler::xorpd(dst, as_Address(src));
3915   } else {
3916     lea(rscratch1, src);
3917     Assembler::xorpd(dst, Address(rscratch1, 0));
3918   }
3919 }
3920 
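     // With dst == src this is the register-zeroing idiom; on AVX-512 targets
     // without AVX512DQ a full-width vpxor is used instead, which is always
     // encodable and clears the entire register.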
3921 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3922   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3923     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3924   }
3925   else {
3926     Assembler::xorpd(dst, src);
3927   }
3928 }
3929 
3930 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3931   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3932     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3933   } else {
3934     Assembler::xorps(dst, src);
3935   }
3936 }
3937 
3938 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
3939   // Used in sign-bit flipping with aligned address.
3940   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3941   if (reachable(src)) {
3942     Assembler::xorps(dst, as_Address(src));
3943   } else {
3944     lea(rscratch1, src);
3945     Assembler::xorps(dst, Address(rscratch1, 0));
3946   }
3947 }
3948 
3949 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3950   // The legacy (non-AVX) encoding of pshufb requires a 16-byte aligned memory operand.
3951   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3952   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3953   if (reachable(src)) {
3954     Assembler::pshufb(dst, as_Address(src));
3955   } else {
3956     lea(rscratch1, src);
3957     Assembler::pshufb(dst, Address(rscratch1, 0));
3958   }
3959 }
3960 
3961 // AVX 3-operands instructions
3962 
3963 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3964   if (reachable(src)) {
3965     vaddsd(dst, nds, as_Address(src));
3966   } else {
3967     lea(rscratch1, src);
3968     vaddsd(dst, nds, Address(rscratch1, 0));
3969   }
3970 }
3971 
3972 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3973   if (reachable(src)) {
3974     vaddss(dst, nds, as_Address(src));
3975   } else {
3976     lea(rscratch1, src);
3977     vaddss(dst, nds, Address(rscratch1, 0));
3978   }
3979 }
3980 
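     // Absolute value: AND the operand with negate_field, a mask whose sign bit
     // is clear.  The branches below only route operands into registers that the
     // vandps/vandpd encoding in use can reach.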
3981 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3982   int dst_enc = dst->encoding();
3983   int nds_enc = nds->encoding();
3984   int src_enc = src->encoding();
3985   if ((dst_enc < 16) && (nds_enc < 16)) {
3986     vandps(dst, nds, negate_field, vector_len);
3987   } else if ((src_enc < 16) && (dst_enc < 16)) {
3988     evmovdqul(src, nds, Assembler::AVX_512bit);
3989     vandps(dst, src, negate_field, vector_len);
3990   } else if (src_enc < 16) {
3991     evmovdqul(src, nds, Assembler::AVX_512bit);
3992     vandps(src, src, negate_field, vector_len);
3993     evmovdqul(dst, src, Assembler::AVX_512bit);
3994   } else if (dst_enc < 16) {
3995     evmovdqul(src, xmm0, Assembler::AVX_512bit);
3996     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
3997     vandps(dst, xmm0, negate_field, vector_len);
3998     evmovdqul(xmm0, src, Assembler::AVX_512bit);
3999   } else {
4000     if (src_enc != dst_enc) {
4001       evmovdqul(src, xmm0, Assembler::AVX_512bit);
4002       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4003       vandps(xmm0, xmm0, negate_field, vector_len);
4004       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4005       evmovdqul(xmm0, src, Assembler::AVX_512bit);
4006     } else {
4007       push_zmm(xmm0);
4008       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4009       vandps(xmm0, xmm0, negate_field, vector_len);
4010       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4011       pop_zmm(xmm0);
4012     }
4013   }
4014 }
4015 
4016 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4017   int dst_enc = dst->encoding();
4018   int nds_enc = nds->encoding();
4019   int src_enc = src->encoding();
4020   if ((dst_enc < 16) && (nds_enc < 16)) {
4021     vandpd(dst, nds, negate_field, vector_len);
4022   } else if ((src_enc < 16) && (dst_enc < 16)) {
4023     evmovdqul(src, nds, Assembler::AVX_512bit);
4024     vandpd(dst, src, negate_field, vector_len);
4025   } else if (src_enc < 16) {
4026     evmovdqul(src, nds, Assembler::AVX_512bit);
4027     vandpd(src, src, negate_field, vector_len);
4028     evmovdqul(dst, src, Assembler::AVX_512bit);
4029   } else if (dst_enc < 16) {
4030     evmovdqul(src, xmm0, Assembler::AVX_512bit);
4031     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4032     vandpd(dst, xmm0, negate_field, vector_len);
4033     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4034   } else {
4035     if (src_enc != dst_enc) {
4036       evmovdqul(src, xmm0, Assembler::AVX_512bit);
4037       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4038       vandpd(xmm0, xmm0, negate_field, vector_len);
4039       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4040       evmovdqul(xmm0, src, Assembler::AVX_512bit);
4041     } else {
4042       push_zmm(xmm0);
4043       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4044       vandpd(xmm0, xmm0, negate_field, vector_len);
4045       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4046       pop_zmm(xmm0);
4047     }
4048   }
4049 }
4050 
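     // The vector helpers below share a pattern: when AVX512BW is not available
     // and an operand lives in the upper register bank (xmm16-xmm31), operands
     // are staged through nds (treated as a scratch register here) or through
     // xmm0/xmm1 before the legacy-encoded instruction is issued.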
4051 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4052   int dst_enc = dst->encoding();
4053   int nds_enc = nds->encoding();
4054   int src_enc = src->encoding();
4055   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4056     Assembler::vpaddb(dst, nds, src, vector_len);
4057   } else if ((dst_enc < 16) && (src_enc < 16)) {
4058     Assembler::vpaddb(dst, dst, src, vector_len);
4059   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4060     // use nds as scratch for src
4061     evmovdqul(nds, src, Assembler::AVX_512bit);
4062     Assembler::vpaddb(dst, dst, nds, vector_len);
4063   } else if ((src_enc < 16) && (nds_enc < 16)) {
4064     // use nds as scratch for dst
4065     evmovdqul(nds, dst, Assembler::AVX_512bit);
4066     Assembler::vpaddb(nds, nds, src, vector_len);
4067     evmovdqul(dst, nds, Assembler::AVX_512bit);
4068   } else if (dst_enc < 16) {
4069     // use nds as scratch for xmm0 to hold src
4070     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4071     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4072     Assembler::vpaddb(dst, dst, xmm0, vector_len);
4073     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4074   } else {
4075     // worst case scenario, all regs are in the upper bank
4076     push_zmm(xmm1);
4077     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4078     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4079     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4080     Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4081     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4082     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4083     pop_zmm(xmm1);
4084   }
4085 }
4086 
4087 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4088   int dst_enc = dst->encoding();
4089   int nds_enc = nds->encoding();
4090   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4091     Assembler::vpaddb(dst, nds, src, vector_len);
4092   } else if (dst_enc < 16) {
4093     Assembler::vpaddb(dst, dst, src, vector_len);
4094   } else if (nds_enc < 16) {
4095     // implies dst_enc in upper bank with nds as scratch
4096     evmovdqul(nds, dst, Assembler::AVX_512bit);
4097     Assembler::vpaddb(nds, nds, src, vector_len);
4098     evmovdqul(dst, nds, Assembler::AVX_512bit);
4099   } else {
4100     // worst case scenario, all regs in upper bank
4101     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4102     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4103     Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4104     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4105   }
4106 }
4107 
4108 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4109   int dst_enc = dst->encoding();
4110   int nds_enc = nds->encoding();
4111   int src_enc = src->encoding();
4112   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4113     Assembler::vpaddw(dst, nds, src, vector_len);
4114   } else if ((dst_enc < 16) && (src_enc < 16)) {
4115     Assembler::vpaddw(dst, dst, src, vector_len);
4116   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4117     // use nds as scratch for src
4118     evmovdqul(nds, src, Assembler::AVX_512bit);
4119     Assembler::vpaddw(dst, dst, nds, vector_len);
4120   } else if ((src_enc < 16) && (nds_enc < 16)) {
4121     // use nds as scratch for dst
4122     evmovdqul(nds, dst, Assembler::AVX_512bit);
4123     Assembler::vpaddw(nds, nds, src, vector_len);
4124     evmovdqul(dst, nds, Assembler::AVX_512bit);
4125   } else if (dst_enc < 16) {
4126     // use nds as scratch for xmm0 to hold src
4127     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4128     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4129     Assembler::vpaddw(dst, dst, xmm0, vector_len);
4130     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4131   } else {
4132     // worst case scenario, all regs are in the upper bank
4133     push_zmm(xmm1);
4134     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4135     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4136     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4137     Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4138     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4139     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4140     pop_zmm(xmm1);
4141   }
4142 }
4143 
4144 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4145   int dst_enc = dst->encoding();
4146   int nds_enc = nds->encoding();
4147   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4148     Assembler::vpaddw(dst, nds, src, vector_len);
4149   } else if (dst_enc < 16) {
4150     Assembler::vpaddw(dst, dst, src, vector_len);
4151   } else if (nds_enc < 16) {
4152     // implies dst_enc in upper bank with nds as scratch
4153     evmovdqul(nds, dst, Assembler::AVX_512bit);
4154     Assembler::vpaddw(nds, nds, src, vector_len);
4155     evmovdqul(dst, nds, Assembler::AVX_512bit);
4156   } else {
4157     // worst case scenario, all regs in upper bank
4158     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4159     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4160     Assembler::vpaddw(xmm0, xmm0, src, vector_len);
4161     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4162   }
4163 }
4164 
4165 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4166   if (reachable(src)) {
4167     Assembler::vpand(dst, nds, as_Address(src), vector_len);
4168   } else {
4169     lea(rscratch1, src);
4170     Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
4171   }
4172 }
4173 
4174 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
4175   int dst_enc = dst->encoding();
4176   int src_enc = src->encoding();
4177   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4178     Assembler::vpbroadcastw(dst, src);
4179   } else if ((dst_enc < 16) && (src_enc < 16)) {
4180     Assembler::vpbroadcastw(dst, src);
4181   } else if (src_enc < 16) {
4182     push_zmm(xmm0);
4183     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4184     Assembler::vpbroadcastw(xmm0, src);
4185     movdqu(dst, xmm0);
4186     pop_zmm(xmm0);
4187   } else if (dst_enc < 16) {
4188     push_zmm(xmm0);
4189     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4190     Assembler::vpbroadcastw(dst, xmm0);
4191     pop_zmm(xmm0);
4192   } else {
4193     push_zmm(xmm0);
4194     push_zmm(xmm1);
4195     movdqu(xmm0, src);
4196     movdqu(xmm1, dst);
4197     Assembler::vpbroadcastw(xmm1, xmm0);
4198     movdqu(dst, xmm1);
4199     pop_zmm(xmm1);
4200     pop_zmm(xmm0);
4201   }
4202 }
4203 
4204 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4205   int dst_enc = dst->encoding();
4206   int nds_enc = nds->encoding();
4207   int src_enc = src->encoding();
4208   assert(dst_enc == nds_enc, "dst and nds must be the same register");
4209   if ((dst_enc < 16) && (src_enc < 16)) {
4210     Assembler::vpcmpeqb(dst, nds, src, vector_len);
4211   } else if (src_enc < 16) {
4212     push_zmm(xmm0);
4213     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4214     Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
4215     movdqu(dst, xmm0);
4216     pop_zmm(xmm0);
4217   } else if (dst_enc < 16) {
4218     push_zmm(xmm0);
4219     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4220     Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
4221     pop_zmm(xmm0);
4222   } else {
4223     push_zmm(xmm0);
4224     push_zmm(xmm1);
4225     movdqu(xmm0, src);
4226     movdqu(xmm1, dst);
4227     Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
4228     movdqu(dst, xmm1);
4229     pop_zmm(xmm1);
4230     pop_zmm(xmm0);
4231   }
4232 }
4233 
4234 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4235   int dst_enc = dst->encoding();
4236   int nds_enc = nds->encoding();
4237   int src_enc = src->encoding();
4238   assert(dst_enc == nds_enc, "dst and nds must be the same register");
4239   if ((dst_enc < 16) && (src_enc < 16)) {
4240     Assembler::vpcmpeqw(dst, nds, src, vector_len);
4241   } else if (src_enc < 16) {
4242     push_zmm(xmm0);
4243     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4244     Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
4245     movdqu(dst, xmm0);
4246     pop_zmm(xmm0);
4247   } else if (dst_enc < 16) {
4248     push_zmm(xmm0);
4249     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4250     Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
4251     pop_zmm(xmm0);
4252   } else {
4253     push_zmm(xmm0);
4254     push_zmm(xmm1);
4255     movdqu(xmm0, src);
4256     movdqu(xmm1, dst);
4257     Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
4258     movdqu(dst, xmm1);
4259     pop_zmm(xmm1);
4260     pop_zmm(xmm0);
4261   }
4262 }
4263 
4264 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4265   int dst_enc = dst->encoding();
4266   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4267     Assembler::vpmovzxbw(dst, src, vector_len);
4268   } else if (dst_enc < 16) {
4269     Assembler::vpmovzxbw(dst, src, vector_len);
4270   } else {
4271     push_zmm(xmm0);
4272     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4273     Assembler::vpmovzxbw(xmm0, src, vector_len);
4274     movdqu(dst, xmm0);
4275     pop_zmm(xmm0);
4276   }
4277 }
4278 
4279 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4280   int src_enc = src->encoding();
4281   if (src_enc < 16) {
4282     Assembler::vpmovmskb(dst, src);
4283   } else {
4284     push_zmm(xmm0);
4285     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4286     Assembler::vpmovmskb(dst, xmm0);
4287     pop_zmm(xmm0);
4288   }
4289 }
4290 
4291 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4292   int dst_enc = dst->encoding();
4293   int nds_enc = nds->encoding();
4294   int src_enc = src->encoding();
4295   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4296     Assembler::vpmullw(dst, nds, src, vector_len);
4297   } else if ((dst_enc < 16) && (src_enc < 16)) {
4298     Assembler::vpmullw(dst, dst, src, vector_len);
4299   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4300     // use nds as scratch for src
4301     evmovdqul(nds, src, Assembler::AVX_512bit);
4302     Assembler::vpmullw(dst, dst, nds, vector_len);
4303   } else if ((src_enc < 16) && (nds_enc < 16)) {
4304     // use nds as scratch for dst
4305     evmovdqul(nds, dst, Assembler::AVX_512bit);
4306     Assembler::vpmullw(nds, nds, src, vector_len);
4307     evmovdqul(dst, nds, Assembler::AVX_512bit);
4308   } else if (dst_enc < 16) {
4309     // use nds as scratch for xmm0 to hold src
4310     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4311     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4312     Assembler::vpmullw(dst, dst, xmm0, vector_len);
4313     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4314   } else {
4315     // worst case scenario, all regs are in the upper bank
4316     push_zmm(xmm1);
4317     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4318     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4319     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4320     Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4321     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4322     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4323     pop_zmm(xmm1);
4324   }
4325 }
4326 
4327 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4328   int dst_enc = dst->encoding();
4329   int nds_enc = nds->encoding();
4330   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4331     Assembler::vpmullw(dst, nds, src, vector_len);
4332   } else if (dst_enc < 16) {
4333     Assembler::vpmullw(dst, dst, src, vector_len);
4334   } else if (nds_enc < 16) {
4335     // implies dst_enc in upper bank with nds as scratch
4336     evmovdqul(nds, dst, Assembler::AVX_512bit);
4337     Assembler::vpmullw(nds, nds, src, vector_len);
4338     evmovdqul(dst, nds, Assembler::AVX_512bit);
4339   } else {
4340     // worst case scenario, all regs in upper bank
4341     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4342     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4343     Assembler::vpmullw(xmm0, xmm0, src, vector_len);
4344     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4345   }
4346 }
4347 
4348 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4349   int dst_enc = dst->encoding();
4350   int nds_enc = nds->encoding();
4351   int src_enc = src->encoding();
4352   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4353     Assembler::vpsubb(dst, nds, src, vector_len);
4354   } else if ((dst_enc < 16) && (src_enc < 16)) {
4355     Assembler::vpsubb(dst, dst, src, vector_len);
4356   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4357     // use nds as scratch for src
4358     evmovdqul(nds, src, Assembler::AVX_512bit);
4359     Assembler::vpsubb(dst, dst, nds, vector_len);
4360   } else if ((src_enc < 16) && (nds_enc < 16)) {
4361     // use nds as scratch for dst
4362     evmovdqul(nds, dst, Assembler::AVX_512bit);
4363     Assembler::vpsubb(nds, nds, src, vector_len);
4364     evmovdqul(dst, nds, Assembler::AVX_512bit);
4365   } else if (dst_enc < 16) {
4366     // use nds as scratch for xmm0 to hold src
4367     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4368     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4369     Assembler::vpsubb(dst, dst, xmm0, vector_len);
4370     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4371   } else {
4372     // worst case scenario, all regs are in the upper bank
4373     push_zmm(xmm1);
4374     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4375     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4376     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4377     Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4378     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4379     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4380     pop_zmm(xmm1);
4381   }
4382 }
4383 
4384 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4385   int dst_enc = dst->encoding();
4386   int nds_enc = nds->encoding();
4387   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4388     Assembler::vpsubb(dst, nds, src, vector_len);
4389   } else if (dst_enc < 16) {
4390     Assembler::vpsubb(dst, dst, src, vector_len);
4391   } else if (nds_enc < 16) {
4392     // implies dst_enc in upper bank with nds as scratch
4393     evmovdqul(nds, dst, Assembler::AVX_512bit);
4394     Assembler::vpsubb(nds, nds, src, vector_len);
4395     evmovdqul(dst, nds, Assembler::AVX_512bit);
4396   } else {
4397     // worst case scenario, all regs in upper bank
4398     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4399     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4400     Assembler::vpsubb(xmm0, xmm0, src, vector_len);
4401     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4402   }
4403 }
4404 
4405 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4406   int dst_enc = dst->encoding();
4407   int nds_enc = nds->encoding();
4408   int src_enc = src->encoding();
4409   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4410     Assembler::vpsubw(dst, nds, src, vector_len);
4411   } else if ((dst_enc < 16) && (src_enc < 16)) {
4412     Assembler::vpsubw(dst, dst, src, vector_len);
4413   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4414     // use nds as scratch for src
4415     evmovdqul(nds, src, Assembler::AVX_512bit);
4416     Assembler::vpsubw(dst, dst, nds, vector_len);
4417   } else if ((src_enc < 16) && (nds_enc < 16)) {
4418     // use nds as scratch for dst
4419     evmovdqul(nds, dst, Assembler::AVX_512bit);
4420     Assembler::vpsubw(nds, nds, src, vector_len);
4421     evmovdqul(dst, nds, Assembler::AVX_512bit);
4422   } else if (dst_enc < 16) {
4423     // use nds as scratch for xmm0 to hold src
4424     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4425     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4426     Assembler::vpsubw(dst, dst, xmm0, vector_len);
4427     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4428   } else {
4429     // worst case scenario, all regs are in the upper bank
4430     push_zmm(xmm1);
4431     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4432     evmovdqul(xmm1, src, Assembler::AVX_512bit);
4433     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4434     Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4435     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4436     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4437     pop_zmm(xmm1);
4438   }
4439 }
4440 
4441 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4442   int dst_enc = dst->encoding();
4443   int nds_enc = nds->encoding();
4444   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4445     Assembler::vpsubw(dst, nds, src, vector_len);
4446   } else if (dst_enc < 16) {
4447     Assembler::vpsubw(dst, dst, src, vector_len);
4448   } else if (nds_enc < 16) {
4449     // dst is in the upper bank; use nds as scratch
4450     evmovdqul(nds, dst, Assembler::AVX_512bit);
4451     Assembler::vpsubw(nds, nds, src, vector_len);
4452     evmovdqul(dst, nds, Assembler::AVX_512bit);
4453   } else {
4454     // worst case scenario, all regs in upper bank
4455     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4456     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4457     Assembler::vpsubw(xmm0, xmm0, src, vector_len);
         evmovdqul(dst, xmm0, Assembler::AVX_512bit); // store the result back into dst
4458     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4459   }
4460 }
4461 
4462 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4463   int dst_enc = dst->encoding();
4464   int nds_enc = nds->encoding();
4465   int shift_enc = shift->encoding();
4466   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4467     Assembler::vpsraw(dst, nds, shift, vector_len);
4468   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4469     Assembler::vpsraw(dst, dst, shift, vector_len);
4470   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4471     // use nds as scratch to hold shift
4472     evmovdqul(nds, shift, Assembler::AVX_512bit);
4473     Assembler::vpsraw(dst, dst, nds, vector_len);
4474   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4475     // use nds as scratch for dst
4476     evmovdqul(nds, dst, Assembler::AVX_512bit);
4477     Assembler::vpsraw(nds, nds, shift, vector_len);
4478     evmovdqul(dst, nds, Assembler::AVX_512bit);
4479   } else if (dst_enc < 16) {
4480     // use nds to save a copy of xmm0 and hold shift
4481     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4482     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4483     Assembler::vpsraw(dst, dst, xmm0, vector_len);
4484     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4485   } else if (nds_enc < 16) {
4486     // use nds and dst as temps
4487     evmovdqul(nds, dst, Assembler::AVX_512bit);
4488     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4489     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4490     Assembler::vpsraw(nds, nds, xmm0, vector_len);
4491     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4492     evmovdqul(dst, nds, Assembler::AVX_512bit);
4493   } else {
4494     // worst case scenario, all regs are in the upper bank
4495     push_zmm(xmm1);
4496     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4497     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4498     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4499     Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4500     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4501     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4502     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4503     pop_zmm(xmm1);
4504   }
4505 }
4506 
4507 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4508   int dst_enc = dst->encoding();
4509   int nds_enc = nds->encoding();
4510   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4511     Assembler::vpsraw(dst, nds, shift, vector_len);
4512   } else if (dst_enc < 16) {
4513     Assembler::vpsraw(dst, dst, shift, vector_len);
4514   } else if (nds_enc < 16) {
4515     // use nds as scratch
4516     evmovdqul(nds, dst, Assembler::AVX_512bit);
4517     Assembler::vpsraw(nds, nds, shift, vector_len);
4518     evmovdqul(dst, nds, Assembler::AVX_512bit);
4519   } else {
4520     // use nds as scratch for xmm0
4521     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4522     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4523     Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
         evmovdqul(dst, xmm0, Assembler::AVX_512bit); // store the result back into dst
4524     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4525   }
4526 }
4527 
4528 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4529   int dst_enc = dst->encoding();
4530   int nds_enc = nds->encoding();
4531   int shift_enc = shift->encoding();
4532   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4533     Assembler::vpsrlw(dst, nds, shift, vector_len);
4534   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4535     Assembler::vpsrlw(dst, dst, shift, vector_len);
4536   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4537     // use nds as scratch to hold shift
4538     evmovdqul(nds, shift, Assembler::AVX_512bit);
4539     Assembler::vpsrlw(dst, dst, nds, vector_len);
4540   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4541     // use nds as scratch for dst
4542     evmovdqul(nds, dst, Assembler::AVX_512bit);
4543     Assembler::vpsrlw(nds, nds, shift, vector_len);
4544     evmovdqul(dst, nds, Assembler::AVX_512bit);
4545   } else if (dst_enc < 16) {
4546     // use nds to save a copy of xmm0 and hold shift
4547     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4548     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4549     Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4550     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4551   } else if (nds_enc < 16) {
4552     // use nds and dst as temps
4553     evmovdqul(nds, dst, Assembler::AVX_512bit);
4554     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4555     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4556     Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4557     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4558     evmovdqul(dst, nds, Assembler::AVX_512bit);
4559   } else {
4560     // worst case scenario, all regs are in the upper bank
4561     push_zmm(xmm1);
4562     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4563     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4564     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4565     Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4566     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4567     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4568     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4569     pop_zmm(xmm1);
4570   }
4571 }
4572 
4573 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4574   int dst_enc = dst->encoding();
4575   int nds_enc = nds->encoding();
4576   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4577     Assembler::vpsrlw(dst, nds, shift, vector_len);
4578   } else if (dst_enc < 16) {
4579     Assembler::vpsrlw(dst, dst, shift, vector_len);
4580   } else if (nds_enc < 16) {
4581     // use nds as scratch
4582     evmovdqul(nds, dst, Assembler::AVX_512bit);
4583     Assembler::vpsrlw(nds, nds, shift, vector_len);
4584     evmovdqul(dst, nds, Assembler::AVX_512bit);
4585   } else {
4586     // use nds as scratch for xmm0
4587     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4588     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4589     Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
         evmovdqul(dst, xmm0, Assembler::AVX_512bit); // store the result back into dst
4590     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4591   }
4592 }
4593 
4594 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4595   int dst_enc = dst->encoding();
4596   int nds_enc = nds->encoding();
4597   int shift_enc = shift->encoding();
4598   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4599     Assembler::vpsllw(dst, nds, shift, vector_len);
4600   } else if ((dst_enc < 16) && (shift_enc < 16)) {
4601     Assembler::vpsllw(dst, dst, shift, vector_len);
4602   } else if ((dst_enc < 16) && (nds_enc < 16)) {
4603     // use nds as scratch to hold shift
4604     evmovdqul(nds, shift, Assembler::AVX_512bit);
4605     Assembler::vpsllw(dst, dst, nds, vector_len);
4606   } else if ((shift_enc < 16) && (nds_enc < 16)) {
4607     // use nds as scratch for dst
4608     evmovdqul(nds, dst, Assembler::AVX_512bit);
4609     Assembler::vpsllw(nds, nds, shift, vector_len);
4610     evmovdqul(dst, nds, Assembler::AVX_512bit);
4611   } else if (dst_enc < 16) {
4612     // use nds to save a copy of xmm0 and hold shift
4613     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4614     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4615     Assembler::vpsllw(dst, dst, xmm0, vector_len);
4616     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4617   } else if (nds_enc < 16) {
4618     // use nds and dst as temps
4619     evmovdqul(nds, dst, Assembler::AVX_512bit);
4620     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4621     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4622     Assembler::vpsllw(nds, nds, xmm0, vector_len);
4623     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4624     evmovdqul(dst, nds, Assembler::AVX_512bit);
4625   } else {
4626     // worst case scenario, all regs are in the upper bank
4627     push_zmm(xmm1);
4628     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4629     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4630     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4631     Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
4632     evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4633     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4634     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4635     pop_zmm(xmm1);
4636   }
4637 }
4638 
4639 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4640   int dst_enc = dst->encoding();
4641   int nds_enc = nds->encoding();
4642   if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4643     Assembler::vpsllw(dst, nds, shift, vector_len);
4644   } else if (dst_enc < 16) {
4645     Assembler::vpsllw(dst, dst, shift, vector_len);
4646   } else if (nds_enc < 16) {
4647     // use nds as scratch
4648     evmovdqul(nds, dst, Assembler::AVX_512bit);
4649     Assembler::vpsllw(nds, nds, shift, vector_len);
4650     evmovdqul(dst, nds, Assembler::AVX_512bit);
4651   } else {
4652     // use nds as scratch for xmm0
4653     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4654     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4655     Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
         evmovdqul(dst, xmm0, Assembler::AVX_512bit); // store the result back into dst
4656     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4657   }
4658 }
4659 
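     // vptest has no EVEX form, so operands living in the upper bank
     // (xmm16-xmm31) are first copied down into xmm0/xmm1.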
4660 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
4661   int dst_enc = dst->encoding();
4662   int src_enc = src->encoding();
4663   if ((dst_enc < 16) && (src_enc < 16)) {
4664     Assembler::vptest(dst, src);
4665   } else if (src_enc < 16) {
4666     push_zmm(xmm0);
4667     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4668     Assembler::vptest(xmm0, src);
4669     pop_zmm(xmm0);
4670   } else if (dst_enc < 16) {
4671     push_zmm(xmm0);
4672     evmovdqul(xmm0, src, Assembler::AVX_512bit);
4673     Assembler::vptest(dst, xmm0);
4674     pop_zmm(xmm0);
4675   } else {
4676     push_zmm(xmm0);
4677     push_zmm(xmm1);
4678     movdqu(xmm0, src);
4679     movdqu(xmm1, dst);
4680     Assembler::vptest(xmm1, xmm0);
4681     pop_zmm(xmm1);
4682     pop_zmm(xmm0);
4683   }
4684 }
4685 
4686 // This instruction exists within macros, ergo we cannot control its input
4687 // when emitted through those patterns.
4688 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4689   if (VM_Version::supports_avx512nobw()) {
4690     int dst_enc = dst->encoding();
4691     int src_enc = src->encoding();
4692     if (dst_enc == src_enc) {
4693       if (dst_enc < 16) {
4694         Assembler::punpcklbw(dst, src);
4695       } else {
4696         push_zmm(xmm0);
4697         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4698         Assembler::punpcklbw(xmm0, xmm0);
4699         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4700         pop_zmm(xmm0);
4701       }
4702     } else {
4703       if ((src_enc < 16) && (dst_enc < 16)) {
4704         Assembler::punpcklbw(dst, src);
4705       } else if (src_enc < 16) {
4706         push_zmm(xmm0);
4707         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4708         Assembler::punpcklbw(xmm0, src);
4709         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4710         pop_zmm(xmm0);
4711       } else if (dst_enc < 16) {
4712         push_zmm(xmm0);
4713         evmovdqul(xmm0, src, Assembler::AVX_512bit);
4714         Assembler::punpcklbw(dst, xmm0);
4715         pop_zmm(xmm0);
4716       } else {
4717         push_zmm(xmm0);
4718         push_zmm(xmm1);
4719         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4720         evmovdqul(xmm1, src, Assembler::AVX_512bit);
4721         Assembler::punpcklbw(xmm0, xmm1);
4722         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4723         pop_zmm(xmm1);
4724         pop_zmm(xmm0);
4725       }
4726     }
4727   } else {
4728     Assembler::punpcklbw(dst, src);
4729   }
4730 }
4731 
4732 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
4733   if (VM_Version::supports_avx512vl()) {
4734     Assembler::pshufd(dst, src, mode);
4735   } else {
4736     int dst_enc = dst->encoding();
4737     if (dst_enc < 16) {
4738       Assembler::pshufd(dst, src, mode);
4739     } else {
4740       push_zmm(xmm0);
4741       Assembler::pshufd(xmm0, src, mode);
4742       evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4743       pop_zmm(xmm0);
4744     }
4745   }
4746 }
4747 
4748 // This instruction exists within macros, ergo we cannot control its input
4749 // when emitted through those patterns.
4750 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4751   if (VM_Version::supports_avx512nobw()) {
4752     int dst_enc = dst->encoding();
4753     int src_enc = src->encoding();
4754     if (dst_enc == src_enc) {
4755       if (dst_enc < 16) {
4756         Assembler::pshuflw(dst, src, mode);
4757       } else {
4758         push_zmm(xmm0);
4759         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4760         Assembler::pshuflw(xmm0, xmm0, mode);
4761         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4762         pop_zmm(xmm0);
4763       }
4764     } else {
4765       if ((src_enc < 16) && (dst_enc < 16)) {
4766         Assembler::pshuflw(dst, src, mode);
4767       } else if (src_enc < 16) {
4768         push_zmm(xmm0);
4769         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4770         Assembler::pshuflw(xmm0, src, mode);
4771         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4772         pop_zmm(xmm0);
4773       } else if (dst_enc < 16) {
4774         push_zmm(xmm0);
4775         evmovdqul(xmm0, src, Assembler::AVX_512bit);
4776         Assembler::pshuflw(dst, xmm0, mode);
4777         pop_zmm(xmm0);
4778       } else {
4779         push_zmm(xmm0);
4780         push_zmm(xmm1);
4781         evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4782         evmovdqul(xmm1, src, Assembler::AVX_512bit);
4783         Assembler::pshuflw(xmm0, xmm1, mode);
4784         evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4785         pop_zmm(xmm1);
4786         pop_zmm(xmm0);
4787       }
4788     }
4789   } else {
4790     Assembler::pshuflw(dst, src, mode);
4791   }
4792 }
4793 
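     // AddressLiteral variants: if the literal is reachable as a direct operand
     // it is used in place; otherwise its address is first materialized in
     // rscratch1 and the operand is addressed through that register.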
4794 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4795   if (reachable(src)) {
4796     vandpd(dst, nds, as_Address(src), vector_len);
4797   } else {
4798     lea(rscratch1, src);
4799     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4800   }
4801 }
4802 
4803 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4804   if (reachable(src)) {
4805     vandps(dst, nds, as_Address(src), vector_len);
4806   } else {
4807     lea(rscratch1, src);
4808     vandps(dst, nds, Address(rscratch1, 0), vector_len);
4809   }
4810 }
4811 
4812 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4813   if (reachable(src)) {
4814     vdivsd(dst, nds, as_Address(src));
4815   } else {
4816     lea(rscratch1, src);
4817     vdivsd(dst, nds, Address(rscratch1, 0));
4818   }
4819 }
4820 
4821 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4822   if (reachable(src)) {
4823     vdivss(dst, nds, as_Address(src));
4824   } else {
4825     lea(rscratch1, src);
4826     vdivss(dst, nds, Address(rscratch1, 0));
4827   }
4828 }
4829 
4830 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4831   if (reachable(src)) {
4832     vmulsd(dst, nds, as_Address(src));
4833   } else {
4834     lea(rscratch1, src);
4835     vmulsd(dst, nds, Address(rscratch1, 0));
4836   }
4837 }
4838 
4839 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4840   if (reachable(src)) {
4841     vmulss(dst, nds, as_Address(src));
4842   } else {
4843     lea(rscratch1, src);
4844     vmulss(dst, nds, Address(rscratch1, 0));
4845   }
4846 }
4847 
4848 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4849   if (reachable(src)) {
4850     vsubsd(dst, nds, as_Address(src));
4851   } else {
4852     lea(rscratch1, src);
4853     vsubsd(dst, nds, Address(rscratch1, 0));
4854   }
4855 }
4856 
4857 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4858   if (reachable(src)) {
4859     vsubss(dst, nds, as_Address(src));
4860   } else {
4861     lea(rscratch1, src);
4862     vsubss(dst, nds, Address(rscratch1, 0));
4863   }
4864 }
4865 
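     // vnegatess/vnegatesd negate a scalar float/double by XORing it with the
     // caller-supplied mask at 'src' (typically the sign-bit mask). Without
     // AVX512VL the 128-bit vxorps/vxorpd cannot encode upper-bank registers,
     // so the value is staged through a lower register first.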
4866 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4867   int nds_enc = nds->encoding();
4868   int dst_enc = dst->encoding();
4869   bool dst_upper_bank = (dst_enc > 15);
4870   bool nds_upper_bank = (nds_enc > 15);
4871   if (VM_Version::supports_avx512novl() &&
4872       (nds_upper_bank || dst_upper_bank)) {
4873     if (dst_upper_bank) {
4874       push_zmm(xmm0);
4875       movflt(xmm0, nds);
4876       vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
4877       movflt(dst, xmm0);
4878       pop_zmm(xmm0);
4879     } else {
4880       movflt(dst, nds);
4881       vxorps(dst, dst, src, Assembler::AVX_128bit);
4882     }
4883   } else {
4884     vxorps(dst, nds, src, Assembler::AVX_128bit);
4885   }
4886 }
4887 
4888 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4889   int nds_enc = nds->encoding();
4890   int dst_enc = dst->encoding();
4891   bool dst_upper_bank = (dst_enc > 15);
4892   bool nds_upper_bank = (nds_enc > 15);
4893   if (VM_Version::supports_avx512novl() &&
4894       (nds_upper_bank || dst_upper_bank)) {
4895     if (dst_upper_bank) {
4896       push_zmm(xmm0);
4897       movdbl(xmm0, nds);
4898       vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
4899       movdbl(dst, xmm0);
4900       pop_zmm(xmm0);
4901     } else {
4902       movdbl(dst, nds);
4903       vxorpd(dst, dst, src, Assembler::AVX_128bit);
4904     }
4905   } else {
4906     vxorpd(dst, nds, src, Assembler::AVX_128bit);
4907   }
4908 }
4909 
4910 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4911   if (reachable(src)) {
4912     vxorpd(dst, nds, as_Address(src), vector_len);
4913   } else {
4914     lea(rscratch1, src);
4915     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4916   }
4917 }
4918 
4919 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4920   if (reachable(src)) {
4921     vxorps(dst, nds, as_Address(src), vector_len);
4922   } else {
4923     lea(rscratch1, src);
4924     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4925   }
4926 }
4927 
4928 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4929   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4930   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4931   // The inverted mask is sign-extended
4932   andptr(possibly_jweak, inverted_jweak_mask);
4933 }
4934 
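     // A jobject is a pointer into a JNI handle block; weak handles (jweaks) are
     // tagged in the low bits, and the tag is subtracted off before the handle
     // is dereferenced.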
4935 void MacroAssembler::resolve_jobject(Register value,
4936                                      Register thread,
4937                                      Register tmp) {
4938   assert_different_registers(value, thread, tmp);
4939   Label done, not_weak;
4940   testptr(value, value);
4941   jcc(Assembler::zero, done);                // Use NULL as-is.
4942   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4943   jcc(Assembler::zero, not_weak);
4944   // Resolve jweak.
4945   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4946                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4947   verify_oop(value);
4948   jmp(done);
4949   bind(not_weak);
4950   // Resolve (untagged) jobject.
4951   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4952   verify_oop(value);
4953   bind(done);
4954 }
4955 
4956 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4957   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4958 }
4959 
4960 // Force generation of a 4 byte immediate value even if it fits into 8bit
4961 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4962   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4963 }
4964 
4965 void MacroAssembler::subptr(Register dst, Register src) {
4966   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4967 }
4968 
4969 // C++ bool manipulation
4970 void MacroAssembler::testbool(Register dst) {
4971   if(sizeof(bool) == 1)
4972     testb(dst, 0xff);
4973   else if(sizeof(bool) == 2) {
4974     // testw implementation needed for two byte bools
4975     ShouldNotReachHere();
4976   } else if(sizeof(bool) == 4)
4977     testl(dst, dst);
4978   else
4979     // unsupported
4980     ShouldNotReachHere();
4981 }
4982 
4983 void MacroAssembler::testptr(Register dst, Register src) {
4984   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4985 }
4986 
4987 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4988 void MacroAssembler::tlab_allocate(Register thread, Register obj,
4989                                    Register var_size_in_bytes,
4990                                    int con_size_in_bytes,
4991                                    Register t1,
4992                                    Register t2,
4993                                    Label& slow_case) {
4994   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4995   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4996 }
4997 
4998 // Defines obj, preserves var_size_in_bytes
4999 void MacroAssembler::eden_allocate(Register thread, Register obj,
5000                                    Register var_size_in_bytes,
5001                                    int con_size_in_bytes,
5002                                    Register t1,
5003                                    Label& slow_case) {
5004   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5005   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
5006 }
5007 
5008 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
5009 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
5010   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
5011   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
5012   Label done;
5013 
5014   testptr(length_in_bytes, length_in_bytes);
5015   jcc(Assembler::zero, done);
5016 
5017   // initialize topmost word, divide index by 2, check if odd and test if zero
5018   // note: for the remaining code to work, index must be a multiple of BytesPerWord
5019 #ifdef ASSERT
5020   {
5021     Label L;
5022     testptr(length_in_bytes, BytesPerWord - 1);
5023     jcc(Assembler::zero, L);
5024     stop("length must be a multiple of BytesPerWord");
5025     bind(L);
5026   }
5027 #endif
5028   Register index = length_in_bytes;
5029   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
5030   if (UseIncDec) {
5031     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
5032   } else {
5033     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
5034     shrptr(index, 1);
5035   }
5036 #ifndef _LP64
5037   // index could have not been a multiple of 8 (i.e., bit 2 was set)
5038   {
5039     Label even;
5040     // note: if index was a multiple of 8, then it cannot
5041     //       be 0 now otherwise it must have been 0 before
5042     //       => if it is even, we don't need to check for 0 again
5043     jcc(Assembler::carryClear, even);
5044     // clear topmost word (no jump would be needed if conditional assignment worked here)
5045     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
5046     // index could be 0 now, must check again
5047     jcc(Assembler::zero, done);
5048     bind(even);
5049   }
5050 #endif // !_LP64
5051   // initialize remaining object fields: index is a multiple of 2 now
5052   {
5053     Label loop;
5054     bind(loop);
5055     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
5056     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
5057     decrement(index);
5058     jcc(Assembler::notZero, loop);
5059   }
5060 
5061   bind(done);
5062 }
5063 
5064 // Look up the method for a megamorphic invokeinterface call.
5065 // The target method is determined by <intf_klass, itable_index>.
5066 // The receiver klass is in recv_klass.
5067 // On success, the result will be in method_result, and execution falls through.
5068 // On failure, execution transfers to the given label.
5069 void MacroAssembler::lookup_interface_method(Register recv_klass,
5070                                              Register intf_klass,
5071                                              RegisterOrConstant itable_index,
5072                                              Register method_result,
5073                                              Register scan_temp,
5074                                              Label& L_no_such_interface,
5075                                              bool return_method) {
5076   assert_different_registers(recv_klass, intf_klass, scan_temp);
5077   assert_different_registers(method_result, intf_klass, scan_temp);
5078   assert(recv_klass != method_result || !return_method,
5079          "recv_klass can be destroyed when method isn't needed");
5080 
5081   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
5082          "caller must use same register for non-constant itable index as for method");
5083 
5084   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
5085   int vtable_base = in_bytes(Klass::vtable_start_offset());
5086   int itentry_off = itableMethodEntry::method_offset_in_bytes();
5087   int scan_step   = itableOffsetEntry::size() * wordSize;
5088   int vte_size    = vtableEntry::size_in_bytes();
5089   Address::ScaleFactor times_vte_scale = Address::times_ptr;
5090   assert(vte_size == wordSize, "else adjust times_vte_scale");
5091 
5092   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
5093 
5094   // %%% Could store the aligned, prescaled offset in the klassoop.
5095   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
5096 
5097   if (return_method) {
5098     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
5099     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
5100     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
5101   }
5102 
5103   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
5104   //   if (scan->interface() == intf) {
5105   //     result = (klass + scan->offset() + itable_index);
5106   //   }
5107   // }
5108   Label search, found_method;
5109 
5110   for (int peel = 1; peel >= 0; peel--) {
5111     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
5112     cmpptr(intf_klass, method_result);
5113 
5114     if (peel) {
5115       jccb(Assembler::equal, found_method);
5116     } else {
5117       jccb(Assembler::notEqual, search);
5118       // (invert the test to fall through to found_method...)
5119     }
5120 
5121     if (!peel)  break;
5122 
5123     bind(search);
5124 
5125     // Check that the previous entry is non-null.  A null entry means that
5126     // the receiver class doesn't implement the interface, and wasn't the
5127     // same as when the caller was compiled.
5128     testptr(method_result, method_result);
5129     jcc(Assembler::zero, L_no_such_interface);
5130     addptr(scan_temp, scan_step);
5131   }
5132 
5133   bind(found_method);
5134 
5135   if (return_method) {
5136     // Got a hit.
5137     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
5138     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
5139   }
5140 }
5141 
5142 
5143 // virtual method calling
5144 void MacroAssembler::lookup_virtual_method(Register recv_klass,
5145                                            RegisterOrConstant vtable_index,
5146                                            Register method_result) {
5147   const int base = in_bytes(Klass::vtable_start_offset());
5148   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
5149   Address vtable_entry_addr(recv_klass,
5150                             vtable_index, Address::times_ptr,
5151                             base + vtableEntry::method_offset_in_bytes());
5152   movptr(method_result, vtable_entry_addr);
5153 }
5154 
5155 
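     // Combined subtype check: jumps to L_success if sub_klass is a subtype of
     // super_klass, and falls through on failure (the failure label is bound
     // locally after the slow path).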
5156 void MacroAssembler::check_klass_subtype(Register sub_klass,
5157                            Register super_klass,
5158                            Register temp_reg,
5159                            Label& L_success) {
5160   Label L_failure;
5161   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
5162   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
5163   bind(L_failure);
5164 }
5165 
5166 
5167 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
5168                                                    Register super_klass,
5169                                                    Register temp_reg,
5170                                                    Label* L_success,
5171                                                    Label* L_failure,
5172                                                    Label* L_slow_path,
5173                                         RegisterOrConstant super_check_offset) {
5174   assert_different_registers(sub_klass, super_klass, temp_reg);
5175   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
5176   if (super_check_offset.is_register()) {
5177     assert_different_registers(sub_klass, super_klass,
5178                                super_check_offset.as_register());
5179   } else if (must_load_sco) {
5180     assert(temp_reg != noreg, "supply either a temp or a register offset");
5181   }
5182 
5183   Label L_fallthrough;
5184   int label_nulls = 0;
5185   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5186   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5187   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
5188   assert(label_nulls <= 1, "at most one NULL in the batch");
5189 
5190   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5191   int sco_offset = in_bytes(Klass::super_check_offset_offset());
5192   Address super_check_offset_addr(super_klass, sco_offset);
5193 
5194   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
5195   // range of a jccb.  If this routine grows larger, reconsider at
5196   // least some of these.
5197 #define local_jcc(assembler_cond, label)                                \
5198   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
5199   else                             jcc( assembler_cond, label) /*omit semi*/
5200 
5201   // Hacked jmp, which may only be used just before L_fallthrough.
5202 #define final_jmp(label)                                                \
5203   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
5204   else                            jmp(label)                /*omit semi*/
5205 
5206   // If the pointers are equal, we are done (e.g., String[] elements).
5207   // This self-check enables sharing of secondary supertype arrays among
5208   // non-primary types such as array-of-interface.  Otherwise, each such
5209   // type would need its own customized SSA.
5210   // We move this check to the front of the fast path because many
5211   // type checks are in fact trivially successful in this manner,
5212   // so we get a nicely predicted branch right at the start of the check.
5213   cmpptr(sub_klass, super_klass);
5214   local_jcc(Assembler::equal, *L_success);
5215 
5216   // Check the supertype display:
5217   if (must_load_sco) {
5218     // Positive movl does right thing on LP64.
5219     movl(temp_reg, super_check_offset_addr);
5220     super_check_offset = RegisterOrConstant(temp_reg);
5221   }
5222   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
5223   cmpptr(super_klass, super_check_addr); // load displayed supertype
5224 
5225   // This check has worked decisively for primary supers.
5226   // Secondary supers are sought in the super_cache ('super_cache_addr').
5227   // (Secondary supers are interfaces and very deeply nested subtypes.)
5228   // This works in the same check above because of a tricky aliasing
5229   // between the super_cache and the primary super display elements.
5230   // (The 'super_check_addr' can address either, as the case requires.)
5231   // Note that the cache is updated below if it does not help us find
5232   // what we need immediately.
5233   // So if it was a primary super, we can just fail immediately.
5234   // Otherwise, it's the slow path for us (no success at this point).
5235 
5236   if (super_check_offset.is_register()) {
5237     local_jcc(Assembler::equal, *L_success);
5238     cmpl(super_check_offset.as_register(), sc_offset);
5239     if (L_failure == &L_fallthrough) {
5240       local_jcc(Assembler::equal, *L_slow_path);
5241     } else {
5242       local_jcc(Assembler::notEqual, *L_failure);
5243       final_jmp(*L_slow_path);
5244     }
5245   } else if (super_check_offset.as_constant() == sc_offset) {
5246     // Need a slow path; fast failure is impossible.
5247     if (L_slow_path == &L_fallthrough) {
5248       local_jcc(Assembler::equal, *L_success);
5249     } else {
5250       local_jcc(Assembler::notEqual, *L_slow_path);
5251       final_jmp(*L_success);
5252     }
5253   } else {
5254     // No slow path; it's a fast decision.
5255     if (L_failure == &L_fallthrough) {
5256       local_jcc(Assembler::equal, *L_success);
5257     } else {
5258       local_jcc(Assembler::notEqual, *L_failure);
5259       final_jmp(*L_success);
5260     }
5261   }
5262 
5263   bind(L_fallthrough);
5264 
5265 #undef local_jcc
5266 #undef final_jmp
5267 }
5268 
5269 
5270 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
5271                                                    Register super_klass,
5272                                                    Register temp_reg,
5273                                                    Register temp2_reg,
5274                                                    Label* L_success,
5275                                                    Label* L_failure,
5276                                                    bool set_cond_codes) {
5277   assert_different_registers(sub_klass, super_klass, temp_reg);
5278   if (temp2_reg != noreg)
5279     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
5280 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
5281 
5282   Label L_fallthrough;
5283   int label_nulls = 0;
5284   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5285   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5286   assert(label_nulls <= 1, "at most one NULL in the batch");
5287 
5288   // a couple of useful fields in sub_klass:
5289   int ss_offset = in_bytes(Klass::secondary_supers_offset());
5290   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5291   Address secondary_supers_addr(sub_klass, ss_offset);
5292   Address super_cache_addr(     sub_klass, sc_offset);
5293 
5294   // Do a linear scan of the secondary super-klass chain.
5295   // This code is rarely used, so simplicity is a virtue here.
5296   // The repne_scan instruction uses fixed registers, which we must spill.
5297   // Don't worry too much about pre-existing connections with the input regs.
5298 
5299   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
5300   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
5301 
5302   // Get super_klass value into rax (even if it was in rdi or rcx).
5303   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
5304   if (super_klass != rax || UseCompressedOops) {
5305     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
5306     mov(rax, super_klass);
5307   }
5308   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
5309   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
5310 
5311 #ifndef PRODUCT
5312   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
5313   ExternalAddress pst_counter_addr((address) pst_counter);
5314   NOT_LP64(  incrementl(pst_counter_addr) );
5315   LP64_ONLY( lea(rcx, pst_counter_addr) );
5316   LP64_ONLY( incrementl(Address(rcx, 0)) );
5317 #endif //PRODUCT
5318 
5319   // We will consult the secondary-super array.
5320   movptr(rdi, secondary_supers_addr);
5321   // Load the array length.  (Positive movl does right thing on LP64.)
5322   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
5323   // Skip to start of data.
5324   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
5325 
5326   // Scan RCX words at [RDI] for an occurrence of RAX.
5327   // Set NZ/Z based on last compare.
5328   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
5329   // not change flags (only scas instruction which is repeated sets flags).
5330   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
5331 
5332     testptr(rax,rax); // Set Z = 0
5333     repne_scan();
5334 
5335   // Unspill the temp. registers:
5336   if (pushed_rdi)  pop(rdi);
5337   if (pushed_rcx)  pop(rcx);
5338   if (pushed_rax)  pop(rax);
5339 
5340   if (set_cond_codes) {
5341     // Special hack for the AD files:  rdi is guaranteed non-zero.
5342     assert(!pushed_rdi, "rdi must be left non-NULL");
5343     // Also, the condition codes are properly set Z/NZ on succeed/failure.
5344   }
5345 
5346   if (L_failure == &L_fallthrough)
5347         jccb(Assembler::notEqual, *L_failure);
5348   else  jcc(Assembler::notEqual, *L_failure);
5349 
5350   // Success.  Cache the super we found and proceed in triumph.
5351   movptr(super_cache_addr, super_klass);
5352 
5353   if (L_success != &L_fallthrough) {
5354     jmp(*L_success);
5355   }
5356 
5357 #undef IS_A_TEMP
5358 
5359   bind(L_fallthrough);
5360 }
5361 
5362 
5363 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
5364   if (VM_Version::supports_cmov()) {
5365     cmovl(cc, dst, src);
5366   } else {
5367     Label L;
5368     jccb(negate_condition(cc), L);
5369     movl(dst, src);
5370     bind(L);
5371   }
5372 }
5373 
5374 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
5375   if (VM_Version::supports_cmov()) {
5376     cmovl(cc, dst, src);
5377   } else {
5378     Label L;
5379     jccb(negate_condition(cc), L);
5380     movl(dst, src);
5381     bind(L);
5382   }
5383 }
5384 
5385 void MacroAssembler::verify_oop(Register reg, const char* s) {
5386   if (!VerifyOops) return;
5387 
5388   // Pass register number to verify_oop_subroutine
5389   const char* b = NULL;
5390   {
5391     ResourceMark rm;
5392     stringStream ss;
5393     ss.print("verify_oop: %s: %s", reg->name(), s);
5394     b = code_string(ss.as_string());
5395   }
5396   BLOCK_COMMENT("verify_oop {");
5397 #ifdef _LP64
5398   push(rscratch1);                    // save r10, trashed by movptr()
5399 #endif
5400   push(rax);                          // save rax,
5401   push(reg);                          // pass register argument
5402   ExternalAddress buffer((address) b);
5403   // avoid using pushptr, as it modifies scratch registers
5404   // and our contract is not to modify anything
5405   movptr(rax, buffer.addr());
5406   push(rax);
5407   // call indirectly to solve generation ordering problem
5408   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5409   call(rax);
5410   // Caller pops the arguments (oop, message) and restores rax, r10
5411   BLOCK_COMMENT("} verify_oop");
5412 }
5413 
5414 
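     // If the delayed value has already been filled in (non-zero), fold it plus
     // 'offset' into a constant; otherwise emit code that loads it from
     // delayed_value_addr at run time, asserting in debug builds that it is
     // non-zero by then.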
5415 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
5416                                                       Register tmp,
5417                                                       int offset) {
5418   intptr_t value = *delayed_value_addr;
5419   if (value != 0)
5420     return RegisterOrConstant(value + offset);
5421 
5422   // load indirectly to solve generation ordering problem
5423   movptr(tmp, ExternalAddress((address) delayed_value_addr));
5424 
5425 #ifdef ASSERT
5426   { Label L;
5427     testptr(tmp, tmp);
5428     if (WizardMode) {
5429       const char* buf = NULL;
5430       {
5431         ResourceMark rm;
5432         stringStream ss;
5433         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
5434         buf = code_string(ss.as_string());
5435       }
5436       jcc(Assembler::notZero, L);
5437       STOP(buf);
5438     } else {
5439       jccb(Assembler::notZero, L);
5440       hlt();
5441     }
5442     bind(L);
5443   }
5444 #endif
5445 
5446   if (offset != 0)
5447     addptr(tmp, offset);
5448 
5449   return RegisterOrConstant(tmp);
5450 }
5451 
5452 
5453 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
5454                                          int extra_slot_offset) {
5455   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
5456   int stackElementSize = Interpreter::stackElementSize;
5457   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
5458 #ifdef ASSERT
5459   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
5460   assert(offset1 - offset == stackElementSize, "correct arithmetic");
5461 #endif
5462   Register             scale_reg    = noreg;
5463   Address::ScaleFactor scale_factor = Address::no_scale;
5464   if (arg_slot.is_constant()) {
5465     offset += arg_slot.as_constant() * stackElementSize;
5466   } else {
5467     scale_reg    = arg_slot.as_register();
5468     scale_factor = Address::times(stackElementSize);
5469   }
5470   offset += wordSize;           // return PC is on stack
5471   return Address(rsp, scale_reg, scale_factor, offset);
5472 }
5473 
5474 
5475 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
5476   if (!VerifyOops) return;
5477 
5478   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
5479   // Pass register number to verify_oop_subroutine
5480   const char* b = NULL;
5481   {
5482     ResourceMark rm;
5483     stringStream ss;
5484     ss.print("verify_oop_addr: %s", s);
5485     b = code_string(ss.as_string());
5486   }
5487 #ifdef _LP64
5488   push(rscratch1);                    // save r10, trashed by movptr()
5489 #endif
5490   push(rax);                          // save rax,
5491   // addr may contain rsp so we will have to adjust it based on the push
5492   // we just did (and on 64 bit we do two pushes)
5493   // NOTE: the 64-bit code appears to have had a bug here: it did movq(addr, rax),
5494   // which stores rax into addr, the reverse of what was intended.
5495   if (addr.uses(rsp)) {
5496     lea(rax, addr);
5497     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
5498   } else {
5499     pushptr(addr);
5500   }
5501 
5502   ExternalAddress buffer((address) b);
5503   // pass msg argument
5504   // avoid using pushptr, as it modifies scratch registers
5505   // and our contract is not to modify anything
5506   movptr(rax, buffer.addr());
5507   push(rax);
5508 
5509   // call indirectly to solve generation ordering problem
5510   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
5511   call(rax);
5512   // Caller pops the arguments (addr, message) and restores rax, r10.
5513 }
5514 
5515 void MacroAssembler::verify_tlab() {
5516 #ifdef ASSERT
5517   if (UseTLAB && VerifyOops) {
5518     Label next, ok;
5519     Register t1 = rsi;
5520     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
5521 
5522     push(t1);
5523     NOT_LP64(push(thread_reg));
5524     NOT_LP64(get_thread(thread_reg));
5525 
5526     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5527     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5528     jcc(Assembler::aboveEqual, next);
5529     STOP("assert(top >= start)");
5530     should_not_reach_here();
5531 
5532     bind(next);
5533     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
5534     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5535     jcc(Assembler::aboveEqual, ok);
5536     STOP("assert(top <= end)");
5537     should_not_reach_here();
5538 
5539     bind(ok);
5540     NOT_LP64(pop(thread_reg));
5541     pop(t1);
5542   }
5543 #endif
5544 }
5545 
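     // The helper classes below decode the register state that push_CPU_state()
     // lays out on the stack, so that print_CPU_state() and _verify_FPU() can
     // interpret and pretty-print it.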
5546 class ControlWord {
5547  public:
5548   int32_t _value;
5549 
5550   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
5551   int  precision_control() const       { return  (_value >>  8) & 3      ; }
5552   bool precision() const               { return ((_value >>  5) & 1) != 0; }
5553   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
5554   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
5555   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
5556   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
5557   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
5558 
5559   void print() const {
5560     // rounding control
5561     const char* rc;
5562     switch (rounding_control()) {
5563       case 0: rc = "round near"; break;
5564       case 1: rc = "round down"; break;
5565       case 2: rc = "round up  "; break;
5566       case 3: rc = "chop      "; break;
5567     };
5568     // precision control
5569     const char* pc;
5570     switch (precision_control()) {
5571       case 0: pc = "24 bits "; break;
5572       case 1: pc = "reserved"; break;
5573       case 2: pc = "53 bits "; break;
5574       case 3: pc = "64 bits "; break;
5575     };
5576     // flags
5577     char f[9];
5578     f[0] = ' ';
5579     f[1] = ' ';
5580     f[2] = (precision   ()) ? 'P' : 'p';
5581     f[3] = (underflow   ()) ? 'U' : 'u';
5582     f[4] = (overflow    ()) ? 'O' : 'o';
5583     f[5] = (zero_divide ()) ? 'Z' : 'z';
5584     f[6] = (denormalized()) ? 'D' : 'd';
5585     f[7] = (invalid     ()) ? 'I' : 'i';
5586     f[8] = '\x0';
5587     // output
5588     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5589   }
5590 
5591 };
5592 
5593 class StatusWord {
5594  public:
5595   int32_t _value;
5596 
5597   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
5598   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
5599   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
5600   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
5601   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
5602   int  top() const                     { return  (_value >> 11) & 7      ; }
5603   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
5604   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
5605   bool precision() const               { return ((_value >>  5) & 1) != 0; }
5606   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
5607   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
5608   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
5609   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
5610   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
5611 
5612   void print() const {
5613     // condition codes
5614     char c[5];
5615     c[0] = (C3()) ? '3' : '-';
5616     c[1] = (C2()) ? '2' : '-';
5617     c[2] = (C1()) ? '1' : '-';
5618     c[3] = (C0()) ? '0' : '-';
5619     c[4] = '\x0';
5620     // flags
5621     char f[9];
5622     f[0] = (error_status()) ? 'E' : '-';
5623     f[1] = (stack_fault ()) ? 'S' : '-';
5624     f[2] = (precision   ()) ? 'P' : '-';
5625     f[3] = (underflow   ()) ? 'U' : '-';
5626     f[4] = (overflow    ()) ? 'O' : '-';
5627     f[5] = (zero_divide ()) ? 'Z' : '-';
5628     f[6] = (denormalized()) ? 'D' : '-';
5629     f[7] = (invalid     ()) ? 'I' : '-';
5630     f[8] = '\x0';
5631     // output
5632     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
5633   }
5634 
5635 };
5636 
5637 class TagWord {
5638  public:
5639   int32_t _value;
5640 
5641   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
5642 
5643   void print() const {
5644     printf("%04x", _value & 0xFFFF);
5645   }
5646 
5647 };
5648 
5649 class FPU_Register {
5650  public:
5651   int32_t _m0;
5652   int32_t _m1;
5653   int16_t _ex;
5654 
5655   bool is_indefinite() const           {
5656     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
5657   }
5658 
5659   void print() const {
5660     char  sign = (_ex < 0) ? '-' : '+';
5661     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
5662     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
5663   };
5664 
5665 };
5666 
5667 class FPU_State {
5668  public:
5669   enum {
5670     register_size       = 10,
5671     number_of_registers =  8,
5672     register_mask       =  7
5673   };
5674 
5675   ControlWord  _control_word;
5676   StatusWord   _status_word;
5677   TagWord      _tag_word;
5678   int32_t      _error_offset;
5679   int32_t      _error_selector;
5680   int32_t      _data_offset;
5681   int32_t      _data_selector;
5682   int8_t       _register[register_size * number_of_registers];
5683 
5684   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5685   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
5686 
5687   const char* tag_as_string(int tag) const {
5688     switch (tag) {
5689       case 0: return "valid";
5690       case 1: return "zero";
5691       case 2: return "special";
5692       case 3: return "empty";
5693     }
5694     ShouldNotReachHere();
5695     return NULL;
5696   }
5697 
5698   void print() const {
5699     // print computation registers
5700     { int t = _status_word.top();
5701       for (int i = 0; i < number_of_registers; i++) {
5702         int j = (i - t) & register_mask;
5703         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5704         st(j)->print();
5705         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5706       }
5707     }
5708     printf("\n");
5709     // print control registers
5710     printf("ctrl = "); _control_word.print(); printf("\n");
5711     printf("stat = "); _status_word .print(); printf("\n");
5712     printf("tags = "); _tag_word    .print(); printf("\n");
5713   }
5714 
5715 };
5716 
5717 class Flag_Register {
5718  public:
5719   int32_t _value;
5720 
5721   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5722   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5723   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5724   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5725   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5726   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5727   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5728 
5729   void print() const {
5730     // flags
5731     char f[8];
5732     f[0] = (overflow       ()) ? 'O' : '-';
5733     f[1] = (direction      ()) ? 'D' : '-';
5734     f[2] = (sign           ()) ? 'S' : '-';
5735     f[3] = (zero           ()) ? 'Z' : '-';
5736     f[4] = (auxiliary_carry()) ? 'A' : '-';
5737     f[5] = (parity         ()) ? 'P' : '-';
5738     f[6] = (carry          ()) ? 'C' : '-';
5739     f[7] = '\x0';
5740     // output
5741     printf("%08x  flags = %s", _value, f);
5742   }
5743 
5744 };
5745 
5746 class IU_Register {
5747  public:
5748   int32_t _value;
5749 
5750   void print() const {
5751     printf("%08x  %11d", _value, _value);
5752   }
5753 
5754 };
5755 
5756 class IU_State {
5757  public:
5758   Flag_Register _eflags;
5759   IU_Register   _rdi;
5760   IU_Register   _rsi;
5761   IU_Register   _rbp;
5762   IU_Register   _rsp;
5763   IU_Register   _rbx;
5764   IU_Register   _rdx;
5765   IU_Register   _rcx;
5766   IU_Register   _rax;
5767 
5768   void print() const {
5769     // computation registers
5770     printf("rax,  = "); _rax.print(); printf("\n");
5771     printf("rbx,  = "); _rbx.print(); printf("\n");
5772     printf("rcx  = "); _rcx.print(); printf("\n");
5773     printf("rdx  = "); _rdx.print(); printf("\n");
5774     printf("rdi  = "); _rdi.print(); printf("\n");
5775     printf("rsi  = "); _rsi.print(); printf("\n");
5776     printf("rbp,  = "); _rbp.print(); printf("\n");
5777     printf("rsp  = "); _rsp.print(); printf("\n");
5778     printf("\n");
5779     // control registers
5780     printf("flgs = "); _eflags.print(); printf("\n");
5781   }
5782 };
5783 
5784 
5785 class CPU_State {
5786  public:
5787   FPU_State _fpu_state;
5788   IU_State  _iu_state;
5789 
5790   void print() const {
5791     printf("--------------------------------------------------\n");
5792     _iu_state .print();
5793     printf("\n");
5794     _fpu_state.print();
5795     printf("--------------------------------------------------\n");
5796   }
5797 
5798 };
5799 
5800 
5801 static void _print_CPU_state(CPU_State* state) {
5802   state->print();
5803 };
5804 
5805 
5806 void MacroAssembler::print_CPU_state() {
5807   push_CPU_state();
5808   push(rsp);                // pass CPU state
5809   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5810   addptr(rsp, wordSize);       // discard argument
5811   pop_CPU_state();
5812 }
5813 
5814 
5815 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5816   static int counter = 0;
5817   FPU_State* fs = &state->_fpu_state;
5818   counter++;
5819   // For leaf calls, only verify that the top few elements remain empty.
5820   // We only need 1 empty at the top for C2 code.
5821   if( stack_depth < 0 ) {
5822     if( fs->tag_for_st(7) != 3 ) {
5823       printf("FPR7 not empty\n");
5824       state->print();
5825       assert(false, "error");
5826       return false;
5827     }
5828     return true;                // All other stack states do not matter
5829   }
5830 
5831   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5832          "bad FPU control word");
5833 
5834   // compute stack depth
5835   int i = 0;
5836   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5837   int d = i;
5838   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5839   // verify findings
5840   if (i != FPU_State::number_of_registers) {
5841     // stack not contiguous
5842     printf("%s: stack not contiguous at ST%d\n", s, i);
5843     state->print();
5844     assert(false, "error");
5845     return false;
5846   }
5847   // check if computed stack depth corresponds to expected stack depth
5848   if (stack_depth < 0) {
5849     // expected stack depth is -stack_depth or less
5850     if (d > -stack_depth) {
5851       // too many elements on the stack
5852       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5853       state->print();
5854       assert(false, "error");
5855       return false;
5856     }
5857   } else {
5858     // expected stack depth is stack_depth
5859     if (d != stack_depth) {
5860       // wrong stack depth
5861       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5862       state->print();
5863       assert(false, "error");
5864       return false;
5865     }
5866   }
5867   // everything is cool
5868   return true;
5869 }
5870 
5871 
5872 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5873   if (!VerifyFPU) return;
5874   push_CPU_state();
5875   push(rsp);                // pass CPU state
5876   ExternalAddress msg((address) s);
5877   // pass message string s
5878   pushptr(msg.addr());
5879   push(stack_depth);        // pass stack depth
5880   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5881   addptr(rsp, 3 * wordSize);   // discard arguments
5882   // check for error
5883   { Label L;
5884     testl(rax, rax);
5885     jcc(Assembler::notZero, L);
5886     int3();                  // break if error condition
5887     bind(L);
5888   }
5889   pop_CPU_state();
5890 }
5891 
5892 void MacroAssembler::restore_cpu_control_state_after_jni() {
5893   // Either restore the MXCSR register after returning from the JNI Call
5894   // or verify that it wasn't changed (with -Xcheck:jni flag).
5895   if (VM_Version::supports_sse()) {
5896     if (RestoreMXCSROnJNICalls) {
5897       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5898     } else if (CheckJNICalls) {
5899       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5900     }
5901   }
5902   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5903   vzeroupper();
5904   // Reset k1 to 0xffff.
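  // (Assumption: this assembler uses k1 as the implicit all-ones opmask for the EVEX
  //  instructions it emits, so native code must not leave it clobbered.)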
5905   if (VM_Version::supports_evex()) {
5906     push(rcx);
5907     movl(rcx, 0xffff);
5908     kmovwl(k1, rcx);
5909     pop(rcx);
5910   }
5911 
5912 #ifndef _LP64
5913   // Either restore the x87 floating point control word after returning
5914   // from the JNI call or verify that it wasn't changed.
5915   if (CheckJNICalls) {
5916     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5917   }
5918 #endif // _LP64
5919 }
5920 
5921 // ((OopHandle)result).resolve();
5922 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5923   assert_different_registers(result, tmp);
5924 
5925   // Only 64 bit platforms support GCs that require a tmp register
5926   // Only IN_HEAP loads require a thread_tmp register
5927   // OopHandle::resolve is an indirection like jobject.
5928   access_load_at(T_OBJECT, IN_NATIVE,
5929                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5930 }
5931 
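// Sketch of the chain walked by load_mirror below:
//   Method* -> ConstMethod* -> ConstantPool* -> pool_holder (the declaring Klass*)
//   -> java_mirror (an OopHandle), which is then resolved to the java.lang.Class oop.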
5932 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5933   // get mirror
5934   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5935   movptr(mirror, Address(method, Method::const_offset()));
5936   movptr(mirror, Address(mirror, ConstMethod::constants_offset()));
5937   movptr(mirror, Address(mirror, ConstantPool::pool_holder_offset_in_bytes()));
5938   movptr(mirror, Address(mirror, mirror_offset));
5939   resolve_oop_handle(mirror, tmp);
5940 }
5941 
5942 void MacroAssembler::load_klass(Register dst, Register src) {
5943 #ifdef _LP64
5944   if (UseCompressedClassPointers) {
5945     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5946     decode_klass_not_null(dst);
5947   } else
5948 #endif
5949     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5950 }
5951 
5952 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5953   load_klass(dst, src);
5954   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5955 }
5956 
5957 void MacroAssembler::store_klass(Register dst, Register src) {
5958 #ifdef _LP64
5959   if (UseCompressedClassPointers) {
5960     encode_klass_not_null(src);
5961     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5962   } else
5963 #endif
5964     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5965 }
5966 
5967 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5968                                     Register tmp1, Register thread_tmp) {
5969   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5970   decorators = AccessInternal::decorator_fixup(decorators);
5971   bool as_raw = (decorators & AS_RAW) != 0;
5972   if (as_raw) {
5973     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5974   } else {
5975     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5976   }
5977 }
5978 
5979 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5980                                      Register tmp1, Register tmp2) {
5981   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5982   decorators = AccessInternal::decorator_fixup(decorators);
5983   bool as_raw = (decorators & AS_RAW) != 0;
5984   if (as_raw) {
5985     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
5986   } else {
5987     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
5988   }
5989 }
5990 
5991 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5992   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5993   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5994     decorators |= ACCESS_READ | ACCESS_WRITE;
5995   }
5996   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5997   return bs->resolve(this, decorators, obj);
5998 }
5999 
6000 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
6001                                    Register thread_tmp, DecoratorSet decorators) {
6002   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
6003 }
6004 
6005 // Doesn't do verification, generates fixed size code
6006 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
6007                                             Register thread_tmp, DecoratorSet decorators) {
6008   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
6009 }
6010 
6011 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
6012                                     Register tmp2, DecoratorSet decorators) {
6013   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
6014 }
6015 
6016 // Used for storing NULLs.
6017 void MacroAssembler::store_heap_oop_null(Address dst) {
6018   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
6019 }
6020 
6021 #ifdef _LP64
6022 void MacroAssembler::store_klass_gap(Register dst, Register src) {
6023   if (UseCompressedClassPointers) {
6024     // Store to klass gap in destination
6025     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
6026   }
6027 }
6028 
6029 #ifdef ASSERT
6030 void MacroAssembler::verify_heapbase(const char* msg) {
6031   assert (UseCompressedOops, "should be compressed");
6032   assert (Universe::heap() != NULL, "java heap should be initialized");
6033   if (CheckCompressedOops) {
6034     Label ok;
6035     push(rscratch1); // cmpptr trashes rscratch1
6036     cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6037     jcc(Assembler::equal, ok);
6038     STOP(msg);
6039     bind(ok);
6040     pop(rscratch1);
6041   }
6042 }
6043 #endif
6044 
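// Illustrative example of the compressed-oop encoding below (example values only, not
// actual VM constants): with heap base B = 0x0000000800000000 and shift 3, an oop at
// B + 0x1040 encodes to 0x1040 >> 3 = 0x208; a NULL oop is first cmov'ed to B so it
// encodes to 0.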
6045 // Algorithm must match oop.inline.hpp encode_heap_oop.
6046 void MacroAssembler::encode_heap_oop(Register r) {
6047 #ifdef ASSERT
6048   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
6049 #endif
6050   verify_oop(r, "broken oop in encode_heap_oop");
6051   if (Universe::narrow_oop_base() == NULL) {
6052     if (Universe::narrow_oop_shift() != 0) {
6053       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6054       shrq(r, LogMinObjAlignmentInBytes);
6055     }
6056     return;
6057   }
6058   testq(r, r);
6059   cmovq(Assembler::equal, r, r12_heapbase);
6060   subq(r, r12_heapbase);
6061   shrq(r, LogMinObjAlignmentInBytes);
6062 }
6063 
6064 void MacroAssembler::encode_heap_oop_not_null(Register r) {
6065 #ifdef ASSERT
6066   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
6067   if (CheckCompressedOops) {
6068     Label ok;
6069     testq(r, r);
6070     jcc(Assembler::notEqual, ok);
6071     STOP("null oop passed to encode_heap_oop_not_null");
6072     bind(ok);
6073   }
6074 #endif
6075   verify_oop(r, "broken oop in encode_heap_oop_not_null");
6076   if (Universe::narrow_oop_base() != NULL) {
6077     subq(r, r12_heapbase);
6078   }
6079   if (Universe::narrow_oop_shift() != 0) {
6080     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6081     shrq(r, LogMinObjAlignmentInBytes);
6082   }
6083 }
6084 
6085 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
6086 #ifdef ASSERT
6087   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
6088   if (CheckCompressedOops) {
6089     Label ok;
6090     testq(src, src);
6091     jcc(Assembler::notEqual, ok);
6092     STOP("null oop passed to encode_heap_oop_not_null2");
6093     bind(ok);
6094   }
6095 #endif
6096   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
6097   if (dst != src) {
6098     movq(dst, src);
6099   }
6100   if (Universe::narrow_oop_base() != NULL) {
6101     subq(dst, r12_heapbase);
6102   }
6103   if (Universe::narrow_oop_shift() != 0) {
6104     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6105     shrq(dst, LogMinObjAlignmentInBytes);
6106   }
6107 }
6108 
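// Decoding below reverses the encoding: the conditional jump after the shift skips the
// heap-base add when the value is zero (NULL stays NULL), otherwise
// oop = base + (narrow << shift).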
6109 void  MacroAssembler::decode_heap_oop(Register r) {
6110 #ifdef ASSERT
6111   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
6112 #endif
6113   if (Universe::narrow_oop_base() == NULL) {
6114     if (Universe::narrow_oop_shift() != 0) {
6115       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6116       shlq(r, LogMinObjAlignmentInBytes);
6117     }
6118   } else {
6119     Label done;
6120     shlq(r, LogMinObjAlignmentInBytes);
6121     jccb(Assembler::equal, done);
6122     addq(r, r12_heapbase);
6123     bind(done);
6124   }
6125   verify_oop(r, "broken oop in decode_heap_oop");
6126 }
6127 
6128 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
6129   // Note: it will change flags
6130   assert (UseCompressedOops, "should only be used for compressed headers");
6131   assert (Universe::heap() != NULL, "java heap should be initialized");
6132   // Cannot assert, unverified entry point counts instructions (see .ad file)
6133   // vtableStubs also counts instructions in pd_code_size_limit.
6134   // Also do not verify_oop as this is called by verify_oop.
6135   if (Universe::narrow_oop_shift() != 0) {
6136     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6137     shlq(r, LogMinObjAlignmentInBytes);
6138     if (Universe::narrow_oop_base() != NULL) {
6139       addq(r, r12_heapbase);
6140     }
6141   } else {
6142     assert (Universe::narrow_oop_base() == NULL, "sanity");
6143   }
6144 }
6145 
6146 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
6147   // Note: it will change flags
6148   assert (UseCompressedOops, "should only be used for compressed headers");
6149   assert (Universe::heap() != NULL, "java heap should be initialized");
6150   // Cannot assert, unverified entry point counts instructions (see .ad file)
6151   // vtableStubs also counts instructions in pd_code_size_limit.
6152   // Also do not verify_oop as this is called by verify_oop.
6153   if (Universe::narrow_oop_shift() != 0) {
6154     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6155     if (LogMinObjAlignmentInBytes == Address::times_8) {
6156       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
6157     } else {
6158       if (dst != src) {
6159         movq(dst, src);
6160       }
6161       shlq(dst, LogMinObjAlignmentInBytes);
6162       if (Universe::narrow_oop_base() != NULL) {
6163         addq(dst, r12_heapbase);
6164       }
6165     }
6166   } else {
6167     assert (Universe::narrow_oop_base() == NULL, "sanity");
6168     if (dst != src) {
6169       movq(dst, src);
6170     }
6171   }
6172 }
6173 
6174 void MacroAssembler::encode_klass_not_null(Register r) {
6175   if (Universe::narrow_klass_base() != NULL) {
6176     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
6177     assert(r != r12_heapbase, "Encoding a klass in r12");
6178     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
6179     subq(r, r12_heapbase);
6180   }
6181   if (Universe::narrow_klass_shift() != 0) {
6182     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6183     shrq(r, LogKlassAlignmentInBytes);
6184   }
6185   if (Universe::narrow_klass_base() != NULL) {
6186     reinit_heapbase();
6187   }
6188 }
6189 
6190 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
6191   if (dst == src) {
6192     encode_klass_not_null(src);
6193   } else {
6194     if (Universe::narrow_klass_base() != NULL) {
6195       mov64(dst, (int64_t)Universe::narrow_klass_base());
6196       negq(dst);
6197       addq(dst, src);
6198     } else {
6199       movptr(dst, src);
6200     }
6201     if (Universe::narrow_klass_shift() != 0) {
6202       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6203       shrq(dst, LogKlassAlignmentInBytes);
6204     }
6205   }
6206 }
6207 
6208 // Function instr_size_for_decode_klass_not_null() counts the instructions
6209 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
6210 // when (Universe::heap() != NULL).  Hence, if the instructions they
6211 // generate change, then this method needs to be updated.
6212 int MacroAssembler::instr_size_for_decode_klass_not_null() {
6213   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
6214   if (Universe::narrow_klass_base() != NULL) {
6215     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
6216     return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
6217   } else {
6218     // longest load decode klass function, mov64, leaq
6219     return 16;
6220   }
6221 }
6222 
6223 // !!! If the instructions that get generated here change then function
6224 // instr_size_for_decode_klass_not_null() needs to get updated.
6225 void  MacroAssembler::decode_klass_not_null(Register r) {
6226   // Note: it will change flags
6227   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6228   assert(r != r12_heapbase, "Decoding a klass in r12");
6229   // Cannot assert, unverified entry point counts instructions (see .ad file)
6230   // vtableStubs also counts instructions in pd_code_size_limit.
6231   // Also do not verify_oop as this is called by verify_oop.
6232   if (Universe::narrow_klass_shift() != 0) {
6233     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6234     shlq(r, LogKlassAlignmentInBytes);
6235   }
6236   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
6237   if (Universe::narrow_klass_base() != NULL) {
6238     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
6239     addq(r, r12_heapbase);
6240     reinit_heapbase();
6241   }
6242 }
6243 
6244 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
6245   // Note: it will change flags
6246   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6247   if (dst == src) {
6248     decode_klass_not_null(dst);
6249   } else {
6250     // Cannot assert, unverified entry point counts instructions (see .ad file)
6251     // vtableStubs also counts instructions in pd_code_size_limit.
6252     // Also do not verify_oop as this is called by verify_oop.
6253     mov64(dst, (int64_t)Universe::narrow_klass_base());
6254     if (Universe::narrow_klass_shift() != 0) {
6255       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6256       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
6257       leaq(dst, Address(dst, src, Address::times_8, 0));
6258     } else {
6259       addq(dst, src);
6260     }
6261   }
6262 }
6263 
6264 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
6265   assert (UseCompressedOops, "should only be used for compressed headers");
6266   assert (Universe::heap() != NULL, "java heap should be initialized");
6267   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6268   int oop_index = oop_recorder()->find_index(obj);
6269   RelocationHolder rspec = oop_Relocation::spec(oop_index);
6270   mov_narrow_oop(dst, oop_index, rspec);
6271 }
6272 
6273 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
6274   assert (UseCompressedOops, "should only be used for compressed headers");
6275   assert (Universe::heap() != NULL, "java heap should be initialized");
6276   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6277   int oop_index = oop_recorder()->find_index(obj);
6278   RelocationHolder rspec = oop_Relocation::spec(oop_index);
6279   mov_narrow_oop(dst, oop_index, rspec);
6280 }
6281 
6282 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
6283   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6284   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6285   int klass_index = oop_recorder()->find_index(k);
6286   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6287   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6288 }
6289 
6290 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
6291   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6292   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6293   int klass_index = oop_recorder()->find_index(k);
6294   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6295   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
6296 }
6297 
6298 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
6299   assert (UseCompressedOops, "should only be used for compressed headers");
6300   assert (Universe::heap() != NULL, "java heap should be initialized");
6301   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6302   int oop_index = oop_recorder()->find_index(obj);
6303   RelocationHolder rspec = oop_Relocation::spec(oop_index);
6304   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6305 }
6306 
6307 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
6308   assert (UseCompressedOops, "should only be used for compressed headers");
6309   assert (Universe::heap() != NULL, "java heap should be initialized");
6310   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6311   int oop_index = oop_recorder()->find_index(obj);
6312   RelocationHolder rspec = oop_Relocation::spec(oop_index);
6313   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
6314 }
6315 
6316 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
6317   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6318   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6319   int klass_index = oop_recorder()->find_index(k);
6320   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6321   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6322 }
6323 
6324 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
6325   assert (UseCompressedClassPointers, "should only be used for compressed headers");
6326   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6327   int klass_index = oop_recorder()->find_index(k);
6328   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6329   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
6330 }
6331 
6332 void MacroAssembler::reinit_heapbase() {
6333   if (UseCompressedOops || UseCompressedClassPointers) {
6334     if (Universe::heap() != NULL) {
6335       if (Universe::narrow_oop_base() == NULL) {
6336         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
6337       } else {
6338         mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
6339       }
6340     } else {
6341       movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6342     }
6343   }
6344 }
6345 
6346 #endif // _LP64
6347 
6348 // C2 compiled method's prolog code.
6349 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
6350 
6351   // WARNING: Initial instruction MUST be 5 bytes or longer so that
6352   // NativeJump::patch_verified_entry will be able to patch out the entry
6353   // code safely. The push to verify stack depth is ok at 5 bytes,
6354   // the frame allocation can be either 3 or 6 bytes. So if we don't do
6355   // stack bang then we must use the 6 byte frame allocation even if
6356   // we have no frame. :-(
6357   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
6358 
6359   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
6360   // Remove word for return addr
6361   framesize -= wordSize;
6362   stack_bang_size -= wordSize;
6363 
6364   // Calls to C2R adapters often do not accept exceptional returns.
6365 // We require that their callers bang for them.  But be careful, because
6366   // some VM calls (such as call site linkage) can use several kilobytes of
6367   // stack.  But the stack safety zone should account for that.
6368   // See bugs 4446381, 4468289, 4497237.
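  // Illustrative frame shape for the stack-bang path below (stack grows downward):
  //   [ return address  ]   <- pushed by the caller
  //   [ saved rbp       ]   <- push(rbp)
  //   [ rest of frame   ]   <- subptr(rsp, framesize) after the rbp word is deducted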
6369   if (stack_bang_size > 0) {
6370     generate_stack_overflow_check(stack_bang_size);
6371 
6372     // We always push rbp, so that on return to interpreter rbp, will be
6373     // restored correctly and we can correct the stack.
6374     push(rbp);
6375     // Save caller's stack pointer into RBP if the frame pointer is preserved.
6376     if (PreserveFramePointer) {
6377       mov(rbp, rsp);
6378     }
6379     // Remove word for ebp
6380     framesize -= wordSize;
6381 
6382     // Create frame
6383     if (framesize) {
6384       subptr(rsp, framesize);
6385     }
6386   } else {
6387     // Create frame (force generation of a 4 byte immediate value)
6388     subptr_imm32(rsp, framesize);
6389 
6390     // Save RBP register now.
6391     framesize -= wordSize;
6392     movptr(Address(rsp, framesize), rbp);
6393     // Save caller's stack pointer into RBP if the frame pointer is preserved.
6394     if (PreserveFramePointer) {
6395       movptr(rbp, rsp);
6396       if (framesize > 0) {
6397         addptr(rbp, framesize);
6398       }
6399     }
6400   }
6401 
6402   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
6403     framesize -= wordSize;
6404     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
6405   }
6406 
6407 #ifndef _LP64
6408   // If method sets FPU control word do it now
6409   if (fp_mode_24b) {
6410     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
6411   }
6412   if (UseSSE >= 2 && VerifyFPU) {
6413     verify_FPU(0, "FPU stack must be clean on entry");
6414   }
6415 #endif
6416 
6417 #ifdef ASSERT
6418   if (VerifyStackAtCalls) {
6419     Label L;
6420     push(rax);
6421     mov(rax, rsp);
6422     andptr(rax, StackAlignmentInBytes-1);
6423     cmpptr(rax, StackAlignmentInBytes-wordSize);
6424     pop(rax);
6425     jcc(Assembler::equal, L);
6426     STOP("Stack is not properly aligned!");
6427     bind(L);
6428   }
6429 #endif
6430 
6431 }
6432 
6433 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
6434 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
6435   // cnt - number of qwords (8-byte words).
6436   // base - start address, qword aligned.
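  // Shape of the code below: a 64-byte main loop while at least 8 qwords remain,
  // one optional 32-byte step when 4..7 qwords remain, then an 8-byte store loop
  // for the final 0..3 qwords.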
6437   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
6438   if (UseAVX >= 2) {
6439     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
6440   } else {
6441     pxor(xtmp, xtmp);
6442   }
6443   jmp(L_zero_64_bytes);
6444 
6445   BIND(L_loop);
6446   if (UseAVX >= 2) {
6447     vmovdqu(Address(base,  0), xtmp);
6448     vmovdqu(Address(base, 32), xtmp);
6449   } else {
6450     movdqu(Address(base,  0), xtmp);
6451     movdqu(Address(base, 16), xtmp);
6452     movdqu(Address(base, 32), xtmp);
6453     movdqu(Address(base, 48), xtmp);
6454   }
6455   addptr(base, 64);
6456 
6457   BIND(L_zero_64_bytes);
6458   subptr(cnt, 8);
6459   jccb(Assembler::greaterEqual, L_loop);
6460   addptr(cnt, 4);
6461   jccb(Assembler::less, L_tail);
6462   // Copy trailing 32 bytes
6463   if (UseAVX >= 2) {
6464     vmovdqu(Address(base, 0), xtmp);
6465   } else {
6466     movdqu(Address(base,  0), xtmp);
6467     movdqu(Address(base, 16), xtmp);
6468   }
6469   addptr(base, 32);
6470   subptr(cnt, 4);
6471 
6472   BIND(L_tail);
6473   addptr(cnt, 4);
6474   jccb(Assembler::lessEqual, L_end);
6475   decrement(cnt);
6476 
6477   BIND(L_sloop);
6478   movq(Address(base, 0), xtmp);
6479   addptr(base, 8);
6480   decrement(cnt);
6481   jccb(Assembler::greaterEqual, L_sloop);
6482   BIND(L_end);
6483 }
6484 
6485 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
6486   // cnt - number of qwords (8-byte words).
6487   // base - start address, qword aligned.
6488   // is_large - true if the compiler knows cnt is larger than InitArrayShortSize
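  // Dispatch below, in pseudocode:
  //   if (!is_large && cnt <= InitArrayShortSize/BytesPerLong) -> pointer-sized store loop
  //   else if (UseFastStosb)                                   -> cnt *= 8; rep stosb
  //   else if (UseXMMForObjInit)                               -> xmm_clear_mem()
  //   else                                                     -> rep stos (pointer-sized)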
6489   assert(base==rdi, "base register must be edi for rep stos");
6490   assert(tmp==rax,   "tmp register must be eax for rep stos");
6491   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
6492   assert(InitArrayShortSize % BytesPerLong == 0,
6493     "InitArrayShortSize should be the multiple of BytesPerLong");
6494 
6495   Label DONE;
6496 
6497   if (!is_large || !UseXMMForObjInit) {
6498     xorptr(tmp, tmp);
6499   }
6500 
6501   if (!is_large) {
6502     Label LOOP, LONG;
6503     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
6504     jccb(Assembler::greater, LONG);
6505 
6506     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6507 
6508     decrement(cnt);
6509     jccb(Assembler::negative, DONE); // Zero length
6510 
6511     // Use individual pointer-sized stores for small counts:
6512     BIND(LOOP);
6513     movptr(Address(base, cnt, Address::times_ptr), tmp);
6514     decrement(cnt);
6515     jccb(Assembler::greaterEqual, LOOP);
6516     jmpb(DONE);
6517 
6518     BIND(LONG);
6519   }
6520 
6521   // Use longer rep-prefixed ops for non-small counts:
6522   if (UseFastStosb) {
6523     shlptr(cnt, 3); // convert to number of bytes
6524     rep_stosb();
6525   } else if (UseXMMForObjInit) {
6526     movptr(tmp, base);
6527     xmm_clear_mem(tmp, cnt, xtmp);
6528   } else {
6529     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6530     rep_stos();
6531   }
6532 
6533   BIND(DONE);
6534 }
6535 
6536 #ifdef COMPILER2
6537 
6538 // IndexOf for constant substrings with size >= 8 chars
6539 // which don't need to be loaded through stack.
6540 void MacroAssembler::string_indexofC8(Register str1, Register str2,
6541                                       Register cnt1, Register cnt2,
6542                                       int int_cnt2,  Register result,
6543                                       XMMRegister vec, Register tmp,
6544                                       int ae) {
6545   ShortBranchVerifier sbv(this);
6546   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6547   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6548 
6549   // This method uses the pcmpestri instruction with bound registers
6550   //   inputs:
6551   //     xmm - substring
6552   //     rax - substring length (elements count)
6553   //     mem - scanned string
6554   //     rdx - string length (elements count)
6555   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6556   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6557   //   outputs:
6558   //     rcx - matched index in string
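  // imm8 bit fields for pcmpestri (per the SSE4.2 definition): bits 1:0 select the element
  // format (00 = unsigned bytes, 01 = unsigned words), bits 3:2 = 11 select "equal ordered"
  // (substring search), bits 5:4 = 00 positive polarity, bit 6 = 0 returns the least
  // significant matching index in rcx; hence 0x0c = byte search, 0x0d = word search.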
6559   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6560   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6561   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6562   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6563   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6564 
6565   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6566         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
6567         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
6568 
6569   // Note, inline_string_indexOf() generates checks:
6570   // if (substr.count > string.count) return -1;
6571   // if (substr.count == 0) return 0;
6572   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
6573 
6574   // Load substring.
6575   if (ae == StrIntrinsicNode::UL) {
6576     pmovzxbw(vec, Address(str2, 0));
6577   } else {
6578     movdqu(vec, Address(str2, 0));
6579   }
6580   movl(cnt2, int_cnt2);
6581   movptr(result, str1); // string addr
6582 
6583   if (int_cnt2 > stride) {
6584     jmpb(SCAN_TO_SUBSTR);
6585 
6586     // Reload substr for rescan, this code
6587     // is executed only for large substrings (> 8 chars)
6588     bind(RELOAD_SUBSTR);
6589     if (ae == StrIntrinsicNode::UL) {
6590       pmovzxbw(vec, Address(str2, 0));
6591     } else {
6592       movdqu(vec, Address(str2, 0));
6593     }
6594     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
6595 
6596     bind(RELOAD_STR);
6597     // We came here after the beginning of the substring was
6598     // matched but the rest of it was not, so we need to search
6599     // again. Start from the next element after the previous match.
6600 
6601     // cnt2 is the number of remaining substring elements and
6602     // cnt1 is the number of remaining string elements when the compare failed.
6603     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
6604     subl(cnt1, cnt2);
6605     addl(cnt1, int_cnt2);
6606     movl(cnt2, int_cnt2); // Now restore cnt2
6607 
6608     decrementl(cnt1);     // Shift to next element
6609     cmpl(cnt1, cnt2);
6610     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6611 
6612     addptr(result, (1<<scale1));
6613 
6614   } // (int_cnt2 > 8)
6615 
6616   // Scan string for start of substr in 16-byte vectors
6617   bind(SCAN_TO_SUBSTR);
6618   pcmpestri(vec, Address(result, 0), mode);
6619   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6620   subl(cnt1, stride);
6621   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6622   cmpl(cnt1, cnt2);
6623   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6624   addptr(result, 16);
6625   jmpb(SCAN_TO_SUBSTR);
6626 
6627   // Found a potential substr
6628   bind(FOUND_CANDIDATE);
6629   // Matched whole vector if first element matched (tmp(rcx) == 0).
6630   if (int_cnt2 == stride) {
6631     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
6632   } else { // int_cnt2 > 8
6633     jccb(Assembler::overflow, FOUND_SUBSTR);
6634   }
6635   // After pcmpestri tmp(rcx) contains matched element index
6636   // Compute start addr of substr
6637   lea(result, Address(result, tmp, scale1));
6638 
6639   // Make sure string is still long enough
6640   subl(cnt1, tmp);
6641   cmpl(cnt1, cnt2);
6642   if (int_cnt2 == stride) {
6643     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6644   } else { // int_cnt2 > 8
6645     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
6646   }
6647   // Left less than substring.
6648 
6649   bind(RET_NOT_FOUND);
6650   movl(result, -1);
6651   jmp(EXIT);
6652 
6653   if (int_cnt2 > stride) {
6654     // This code is optimized for the case when whole substring
6655     // is matched if its head is matched.
6656     bind(MATCH_SUBSTR_HEAD);
6657     pcmpestri(vec, Address(result, 0), mode);
6658     // Reload only the string if it does not match
6659     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
6660 
6661     Label CONT_SCAN_SUBSTR;
6662     // Compare the rest of substring (> 8 chars).
6663     bind(FOUND_SUBSTR);
6664     // First 8 chars are already matched.
6665     negptr(cnt2);
6666     addptr(cnt2, stride);
6667 
6668     bind(SCAN_SUBSTR);
6669     subl(cnt1, stride);
6670     cmpl(cnt2, -stride); // Do not read beyond substring
6671     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
6672     // Back-up strings to avoid reading beyond substring:
6673     // cnt1 = cnt1 - cnt2 + 8
6674     addl(cnt1, cnt2); // cnt2 is negative
6675     addl(cnt1, stride);
6676     movl(cnt2, stride); negptr(cnt2);
6677     bind(CONT_SCAN_SUBSTR);
6678     if (int_cnt2 < (int)G) {
6679       int tail_off1 = int_cnt2<<scale1;
6680       int tail_off2 = int_cnt2<<scale2;
6681       if (ae == StrIntrinsicNode::UL) {
6682         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
6683       } else {
6684         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
6685       }
6686       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
6687     } else {
6688       // calculate index in register to avoid integer overflow (int_cnt2*2)
6689       movl(tmp, int_cnt2);
6690       addptr(tmp, cnt2);
6691       if (ae == StrIntrinsicNode::UL) {
6692         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
6693       } else {
6694         movdqu(vec, Address(str2, tmp, scale2, 0));
6695       }
6696       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
6697     }
6698     // Need to reload the string pointers if we did not match the whole vector
6699     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6700     addptr(cnt2, stride);
6701     jcc(Assembler::negative, SCAN_SUBSTR);
6702     // Fall through if found full substring
6703 
6704   } // (int_cnt2 > 8)
6705 
6706   bind(RET_FOUND);
6707   // Found result if we matched full small substring.
6708   // Compute substr offset
6709   subptr(result, str1);
6710   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6711     shrl(result, 1); // index
6712   }
6713   bind(EXIT);
6714 
6715 } // string_indexofC8
6716 
6717 // Small strings are loaded through the stack if they cross a page boundary.
6718 void MacroAssembler::string_indexof(Register str1, Register str2,
6719                                     Register cnt1, Register cnt2,
6720                                     int int_cnt2,  Register result,
6721                                     XMMRegister vec, Register tmp,
6722                                     int ae) {
6723   ShortBranchVerifier sbv(this);
6724   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6725   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6726 
6727   //
6728   // int_cnt2 is length of small (< 8 chars) constant substring
6729   // or (-1) for non constant substring in which case its length
6730   // is in cnt2 register.
6731   //
6732   // Note, inline_string_indexOf() generates checks:
6733   // if (substr.count > string.count) return -1;
6734   // if (substr.count == 0) return 0;
6735   //
6736   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6737   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
6738   // This method uses the pcmpestri instruction with bound registers
6739   //   inputs:
6740   //     xmm - substring
6741   //     rax - substring length (elements count)
6742   //     mem - scanned string
6743   //     rdx - string length (elements count)
6744   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6745   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6746   //   outputs:
6747   //     rcx - matched index in string
6748   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6749   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6750   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6751   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6752 
6753   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6754         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6755         FOUND_CANDIDATE;
6756 
6757   { //========================================================
6758     // We don't know where these strings are located
6759     // and we can't read beyond them. Load them through stack.
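    // Why copy to the stack: pcmpestri always reads 16 bytes, so a short string that
    // starts within the last 16 bytes of a page could fault on the next (possibly
    // unmapped) page. Strings with (address & (page_size-1)) <= page_size-16 are read
    // in place; the rest are copied to the stack first.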
6760     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6761 
6762     movptr(tmp, rsp); // save old SP
6763 
6764     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6765       if (int_cnt2 == (1>>scale2)) { // One byte
6766         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
6767         load_unsigned_byte(result, Address(str2, 0));
6768         movdl(vec, result); // move 32 bits
6769       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
6770         // Not enough header space in 32-bit VM: 12+3 = 15.
6771         movl(result, Address(str2, -1));
6772         shrl(result, 8);
6773         movdl(vec, result); // move 32 bits
6774       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
6775         load_unsigned_short(result, Address(str2, 0));
6776         movdl(vec, result); // move 32 bits
6777       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
6778         movdl(vec, Address(str2, 0)); // move 32 bits
6779       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
6780         movq(vec, Address(str2, 0));  // move 64 bits
6781       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
6782         // Array header size is 12 bytes in 32-bit VM
6783         // + 6 bytes for 3 chars == 18 bytes,
6784         // enough space to load vec and shift.
6785         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6786         if (ae == StrIntrinsicNode::UL) {
6787           int tail_off = int_cnt2-8;
6788           pmovzxbw(vec, Address(str2, tail_off));
6789           psrldq(vec, -2*tail_off);
6790         }
6791         else {
6792           int tail_off = int_cnt2*(1<<scale2);
6793           movdqu(vec, Address(str2, tail_off-16));
6794           psrldq(vec, 16-tail_off);
6795         }
6796       }
6797     } else { // not constant substring
6798       cmpl(cnt2, stride);
6799       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6800 
6801       // We can read beyond the string if str+16 does not cross a page boundary
6802       // since heaps are aligned and mapped by pages.
6803       assert(os::vm_page_size() < (int)G, "default page should be small");
6804       movl(result, str2); // We need only low 32 bits
6805       andl(result, (os::vm_page_size()-1));
6806       cmpl(result, (os::vm_page_size()-16));
6807       jccb(Assembler::belowEqual, CHECK_STR);
6808 
6809       // Move small strings to stack to allow load 16 bytes into vec.
6810       subptr(rsp, 16);
6811       int stk_offset = wordSize-(1<<scale2);
6812       push(cnt2);
6813 
6814       bind(COPY_SUBSTR);
6815       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
6816         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
6817         movb(Address(rsp, cnt2, scale2, stk_offset), result);
6818       } else if (ae == StrIntrinsicNode::UU) {
6819         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
6820         movw(Address(rsp, cnt2, scale2, stk_offset), result);
6821       }
6822       decrement(cnt2);
6823       jccb(Assembler::notZero, COPY_SUBSTR);
6824 
6825       pop(cnt2);
6826       movptr(str2, rsp);  // New substring address
6827     } // non constant
6828 
6829     bind(CHECK_STR);
6830     cmpl(cnt1, stride);
6831     jccb(Assembler::aboveEqual, BIG_STRINGS);
6832 
6833     // Check cross page boundary.
6834     movl(result, str1); // We need only low 32 bits
6835     andl(result, (os::vm_page_size()-1));
6836     cmpl(result, (os::vm_page_size()-16));
6837     jccb(Assembler::belowEqual, BIG_STRINGS);
6838 
6839     subptr(rsp, 16);
6840     int stk_offset = -(1<<scale1);
6841     if (int_cnt2 < 0) { // not constant
6842       push(cnt2);
6843       stk_offset += wordSize;
6844     }
6845     movl(cnt2, cnt1);
6846 
6847     bind(COPY_STR);
6848     if (ae == StrIntrinsicNode::LL) {
6849       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
6850       movb(Address(rsp, cnt2, scale1, stk_offset), result);
6851     } else {
6852       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
6853       movw(Address(rsp, cnt2, scale1, stk_offset), result);
6854     }
6855     decrement(cnt2);
6856     jccb(Assembler::notZero, COPY_STR);
6857 
6858     if (int_cnt2 < 0) { // not constant
6859       pop(cnt2);
6860     }
6861     movptr(str1, rsp);  // New string address
6862 
6863     bind(BIG_STRINGS);
6864     // Load substring.
6865     if (int_cnt2 < 0) { // -1
6866       if (ae == StrIntrinsicNode::UL) {
6867         pmovzxbw(vec, Address(str2, 0));
6868       } else {
6869         movdqu(vec, Address(str2, 0));
6870       }
6871       push(cnt2);       // substr count
6872       push(str2);       // substr addr
6873       push(str1);       // string addr
6874     } else {
6875       // Small (< 8 chars) constant substrings are loaded already.
6876       movl(cnt2, int_cnt2);
6877     }
6878     push(tmp);  // original SP
6879 
6880   } // Finished loading
6881 
6882   //========================================================
6883   // Start search
6884   //
6885 
6886   movptr(result, str1); // string addr
6887 
6888   if (int_cnt2  < 0) {  // Only for non constant substring
6889     jmpb(SCAN_TO_SUBSTR);
6890 
6891     // SP saved at sp+0
6892     // String saved at sp+1*wordSize
6893     // Substr saved at sp+2*wordSize
6894     // Substr count saved at sp+3*wordSize
6895 
6896     // Reload substr for rescan, this code
6897     // is executed only for large substrings (> 8 chars)
6898     bind(RELOAD_SUBSTR);
6899     movptr(str2, Address(rsp, 2*wordSize));
6900     movl(cnt2, Address(rsp, 3*wordSize));
6901     if (ae == StrIntrinsicNode::UL) {
6902       pmovzxbw(vec, Address(str2, 0));
6903     } else {
6904       movdqu(vec, Address(str2, 0));
6905     }
6906     // We came here after the beginning of the substring was
6907     // matched but the rest of it was not, so we need to search
6908     // again. Start from the next element after the previous match.
6909     subptr(str1, result); // Restore counter
6910     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6911       shrl(str1, 1);
6912     }
6913     addl(cnt1, str1);
6914     decrementl(cnt1);   // Shift to next element
6915     cmpl(cnt1, cnt2);
6916     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6917 
6918     addptr(result, (1<<scale1));
6919   } // non constant
6920 
6921   // Scan string for start of substr in 16-byte vectors
6922   bind(SCAN_TO_SUBSTR);
6923   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6924   pcmpestri(vec, Address(result, 0), mode);
6925   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6926   subl(cnt1, stride);
6927   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6928   cmpl(cnt1, cnt2);
6929   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6930   addptr(result, 16);
6931 
6932   bind(ADJUST_STR);
6933   cmpl(cnt1, stride); // Do not read beyond string
6934   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6935   // Back-up string to avoid reading beyond string.
6936   lea(result, Address(result, cnt1, scale1, -16));
6937   movl(cnt1, stride);
6938   jmpb(SCAN_TO_SUBSTR);
6939 
6940   // Found a potential substr
6941   bind(FOUND_CANDIDATE);
6942   // After pcmpestri tmp(rcx) contains matched element index
6943 
6944   // Make sure string is still long enough
6945   subl(cnt1, tmp);
6946   cmpl(cnt1, cnt2);
6947   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6948   // Left less than substring.
6949 
6950   bind(RET_NOT_FOUND);
6951   movl(result, -1);
6952   jmpb(CLEANUP);
6953 
6954   bind(FOUND_SUBSTR);
6955   // Compute start addr of substr
6956   lea(result, Address(result, tmp, scale1));
6957   if (int_cnt2 > 0) { // Constant substring
6958     // Repeat search for small substring (< 8 chars)
6959     // from new point without reloading substring.
6960     // Have to check that we don't read beyond string.
6961     cmpl(tmp, stride-int_cnt2);
6962     jccb(Assembler::greater, ADJUST_STR);
6963     // Fall through if matched whole substring.
6964   } else { // non constant
6965     assert(int_cnt2 == -1, "should be != 0");
6966 
6967     addl(tmp, cnt2);
6968     // Found result if we matched whole substring.
6969     cmpl(tmp, stride);
6970     jccb(Assembler::lessEqual, RET_FOUND);
6971 
6972     // Repeat search for small substring (<= 8 chars)
6973     // from new point 'str1' without reloading substring.
6974     cmpl(cnt2, stride);
6975     // Have to check that we don't read beyond string.
6976     jccb(Assembler::lessEqual, ADJUST_STR);
6977 
6978     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6979     // Compare the rest of substring (> 8 chars).
6980     movptr(str1, result);
6981 
6982     cmpl(tmp, cnt2);
6983     // First 8 chars are already matched.
6984     jccb(Assembler::equal, CHECK_NEXT);
6985 
6986     bind(SCAN_SUBSTR);
6987     pcmpestri(vec, Address(str1, 0), mode);
6988     // Need to reload the string pointers if we did not match the whole vector
6989     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6990 
6991     bind(CHECK_NEXT);
6992     subl(cnt2, stride);
6993     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6994     addptr(str1, 16);
6995     if (ae == StrIntrinsicNode::UL) {
6996       addptr(str2, 8);
6997     } else {
6998       addptr(str2, 16);
6999     }
7000     subl(cnt1, stride);
7001     cmpl(cnt2, stride); // Do not read beyond substring
7002     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
7003     // Back-up strings to avoid reading beyond substring.
7004 
7005     if (ae == StrIntrinsicNode::UL) {
7006       lea(str2, Address(str2, cnt2, scale2, -8));
7007       lea(str1, Address(str1, cnt2, scale1, -16));
7008     } else {
7009       lea(str2, Address(str2, cnt2, scale2, -16));
7010       lea(str1, Address(str1, cnt2, scale1, -16));
7011     }
7012     subl(cnt1, cnt2);
7013     movl(cnt2, stride);
7014     addl(cnt1, stride);
7015     bind(CONT_SCAN_SUBSTR);
7016     if (ae == StrIntrinsicNode::UL) {
7017       pmovzxbw(vec, Address(str2, 0));
7018     } else {
7019       movdqu(vec, Address(str2, 0));
7020     }
7021     jmp(SCAN_SUBSTR);
7022 
7023     bind(RET_FOUND_LONG);
7024     movptr(str1, Address(rsp, wordSize));
7025   } // non constant
7026 
7027   bind(RET_FOUND);
7028   // Compute substr offset
7029   subptr(result, str1);
7030   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7031     shrl(result, 1); // index
7032   }
7033   bind(CLEANUP);
7034   pop(rsp); // restore SP
7035 
7036 } // string_indexof
7037 
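// Find the first occurrence of the char 'ch' in a UTF-16 string of cnt1 chars.
// Strategy sketch: broadcast 'ch' into a vector, compare 16 chars per iteration with
// vpcmpeqw/vptest (AVX2) or 8 chars per iteration with pcmpeqw/ptest (SSE4.2), then
// finish the remaining tail with a scalar loop. 'result' gets the char index, or -1
// if the char is not found.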
7038 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7039                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
7040   ShortBranchVerifier sbv(this);
7041   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7042 
7043   int stride = 8;
7044 
7045   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7046         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7047         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7048         FOUND_SEQ_CHAR, DONE_LABEL;
7049 
7050   movptr(result, str1);
7051   if (UseAVX >= 2) {
7052     cmpl(cnt1, stride);
7053     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7054     cmpl(cnt1, 2*stride);
7055     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
7056     movdl(vec1, ch);
7057     vpbroadcastw(vec1, vec1);
7058     vpxor(vec2, vec2);
7059     movl(tmp, cnt1);
7060     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
7061     andl(cnt1,0x0000000F);  //tail count (in chars)
7062 
7063     bind(SCAN_TO_16_CHAR_LOOP);
7064     vmovdqu(vec3, Address(result, 0));
7065     vpcmpeqw(vec3, vec3, vec1, 1);
7066     vptest(vec2, vec3);
7067     jcc(Assembler::carryClear, FOUND_CHAR);
7068     addptr(result, 32);
7069     subl(tmp, 2*stride);
7070     jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7071     jmp(SCAN_TO_8_CHAR);
7072     bind(SCAN_TO_8_CHAR_INIT);
7073     movdl(vec1, ch);
7074     pshuflw(vec1, vec1, 0x00);
7075     pshufd(vec1, vec1, 0);
7076     pxor(vec2, vec2);
7077   }
7078   bind(SCAN_TO_8_CHAR);
7079   cmpl(cnt1, stride);
7080   if (UseAVX >= 2) {
7081     jcc(Assembler::less, SCAN_TO_CHAR);
7082   } else {
7083     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7084     movdl(vec1, ch);
7085     pshuflw(vec1, vec1, 0x00);
7086     pshufd(vec1, vec1, 0);
7087     pxor(vec2, vec2);
7088   }
7089   movl(tmp, cnt1);
7090   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
7091   andl(cnt1,0x00000007);  //tail count (in chars)
7092 
7093   bind(SCAN_TO_8_CHAR_LOOP);
7094   movdqu(vec3, Address(result, 0));
7095   pcmpeqw(vec3, vec1);
7096   ptest(vec2, vec3);
7097   jcc(Assembler::carryClear, FOUND_CHAR);
7098   addptr(result, 16);
7099   subl(tmp, stride);
7100   jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
7101   bind(SCAN_TO_CHAR);
7102   testl(cnt1, cnt1);
7103   jcc(Assembler::zero, RET_NOT_FOUND);
7104   bind(SCAN_TO_CHAR_LOOP);
7105   load_unsigned_short(tmp, Address(result, 0));
7106   cmpl(ch, tmp);
7107   jccb(Assembler::equal, FOUND_SEQ_CHAR);
7108   addptr(result, 2);
7109   subl(cnt1, 1);
7110   jccb(Assembler::zero, RET_NOT_FOUND);
7111   jmp(SCAN_TO_CHAR_LOOP);
7112 
7113   bind(RET_NOT_FOUND);
7114   movl(result, -1);
7115   jmpb(DONE_LABEL);
7116 
7117   bind(FOUND_CHAR);
7118   if (UseAVX >= 2) {
7119     vpmovmskb(tmp, vec3);
7120   } else {
7121     pmovmskb(tmp, vec3);
7122   }
7123   bsfl(ch, tmp);
7124   addl(result, ch);
7125 
7126   bind(FOUND_SEQ_CHAR);
7127   subptr(result, str1);
7128   shrl(result, 1);
7129 
7130   bind(DONE_LABEL);
7131 } // string_indexof_char
7132 
7133 // helper function for string_compare
7134 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
7135                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
7136                                         Address::ScaleFactor scale2, Register index, int ae) {
7137   if (ae == StrIntrinsicNode::LL) {
7138     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
7139     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
7140   } else if (ae == StrIntrinsicNode::UU) {
7141     load_unsigned_short(elem1, Address(str1, index, scale, 0));
7142     load_unsigned_short(elem2, Address(str2, index, scale, 0));
7143   } else {
7144     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
7145     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
7146   }
7147 }
7148 
7149 // Compare strings, used for char[] and byte[].
7150 void MacroAssembler::string_compare(Register str1, Register str2,
7151                                     Register cnt1, Register cnt2, Register result,
7152                                     XMMRegister vec1, int ae) {
7153   ShortBranchVerifier sbv(this);
7154   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
7155   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
7156   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
7157   int stride2x2 = 0x40;
7158   Address::ScaleFactor scale = Address::no_scale;
7159   Address::ScaleFactor scale1 = Address::no_scale;
7160   Address::ScaleFactor scale2 = Address::no_scale;
7161 
7162   if (ae != StrIntrinsicNode::LL) {
7163     stride2x2 = 0x20;
7164   }
7165 
7166   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
7167     shrl(cnt2, 1);
7168   }
7169   // Compute the minimum of the string lengths and the
7170   // difference of the string lengths (stack).
7171   // (cnt2 = min(cnt1, cnt2) via cmov; the signed length difference is pushed on the stack)
7172   movl(result, cnt1);
7173   subl(cnt1, cnt2);
7174   push(cnt1);
7175   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
7176 
7177   // Is the minimum length zero?
7178   testl(cnt2, cnt2);
7179   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7180   if (ae == StrIntrinsicNode::LL) {
7181     // Load first bytes
7182     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
7183     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
7184   } else if (ae == StrIntrinsicNode::UU) {
7185     // Load first characters
7186     load_unsigned_short(result, Address(str1, 0));
7187     load_unsigned_short(cnt1, Address(str2, 0));
7188   } else {
7189     load_unsigned_byte(result, Address(str1, 0));
7190     load_unsigned_short(cnt1, Address(str2, 0));
7191   }
7192   subl(result, cnt1);
7193   jcc(Assembler::notZero,  POP_LABEL);
7194 
7195   if (ae == StrIntrinsicNode::UU) {
7196     // Divide length by 2 to get number of chars
7197     shrl(cnt2, 1);
7198   }
7199   cmpl(cnt2, 1);
7200   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7201 
7202   // Check if the strings start at the same location and setup scale and stride
7203   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7204     cmpptr(str1, str2);
7205     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7206     if (ae == StrIntrinsicNode::LL) {
7207       scale = Address::times_1;
7208       stride = 16;
7209     } else {
7210       scale = Address::times_2;
7211       stride = 8;
7212     }
7213   } else {
7214     scale1 = Address::times_1;
7215     scale2 = Address::times_2;
7216     // scale not used
7217     stride = 8;
7218   }
7219 
7220   if (UseAVX >= 2 && UseSSE42Intrinsics) {
7221     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
7222     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
7223     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
7224     Label COMPARE_TAIL_LONG;
7225     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
7226 
7227     int pcmpmask = 0x19;
7228     if (ae == StrIntrinsicNode::LL) {
7229       pcmpmask &= ~0x01;
7230     }
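    // pcmpmask bit fields: 1:0 = element format (01 words; the &= above clears it to 00
    // bytes for LL), 3:2 = 10 "equal each" (element-wise compare), 5:4 = 01 negative
    // polarity, so rcx receives the index of the first *mismatching* element.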
7231 
7232     // Setup to compare 16-chars (32-bytes) vectors,
7233     // start from first character again because it has aligned address.
7234     if (ae == StrIntrinsicNode::LL) {
7235       stride2 = 32;
7236     } else {
7237       stride2 = 16;
7238     }
7239     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7240       adr_stride = stride << scale;
7241     } else {
7242       adr_stride1 = 8;  //stride << scale1;
7243       adr_stride2 = 16; //stride << scale2;
7244     }
7245 
7246     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7247     // rax and rdx are used by pcmpestri as elements counters
7248     movl(result, cnt2);
7249     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
7250     jcc(Assembler::zero, COMPARE_TAIL_LONG);
7251 
7252     // fast path : compare first 2 8-char vectors.
7253     bind(COMPARE_16_CHARS);
7254     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7255       movdqu(vec1, Address(str1, 0));
7256     } else {
7257       pmovzxbw(vec1, Address(str1, 0));
7258     }
7259     pcmpestri(vec1, Address(str2, 0), pcmpmask);
7260     jccb(Assembler::below, COMPARE_INDEX_CHAR);
7261 
7262     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7263       movdqu(vec1, Address(str1, adr_stride));
7264       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
7265     } else {
7266       pmovzxbw(vec1, Address(str1, adr_stride1));
7267       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
7268     }
7269     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
7270     addl(cnt1, stride);
7271 
7272     // Compare the characters at index in cnt1
7273     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
7274     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7275     subl(result, cnt2);
7276     jmp(POP_LABEL);
7277 
7278     // Setup the registers to start vector comparison loop
7279     bind(COMPARE_WIDE_VECTORS);
7280     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7281       lea(str1, Address(str1, result, scale));
7282       lea(str2, Address(str2, result, scale));
7283     } else {
7284       lea(str1, Address(str1, result, scale1));
7285       lea(str2, Address(str2, result, scale2));
7286     }
7287     subl(result, stride2);
7288     subl(cnt2, stride2);
7289     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
7290     negptr(result);
7291 
7292     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
7293     bind(COMPARE_WIDE_VECTORS_LOOP);
7294 
7295 #ifdef _LP64
7296     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7297       cmpl(cnt2, stride2x2);
7298       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7299       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
7300       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // cnt2 is not a multiple of 0x40, so we cannot step by 0x40
7301 
7302       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7303       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7304         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
7305         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
7306       } else {
7307         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
7308         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0 bits
7309       }
7310       kortestql(k7, k7);
7311       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
7312       addptr(result, stride2x2);  // update since we already compared at this addr
7313       subl(cnt2, stride2x2);      // and sub the size too
7314       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7315 
7316       vpxor(vec1, vec1);
7317       jmpb(COMPARE_WIDE_TAIL);
7318     }//if (VM_Version::supports_avx512vlbw())
7319 #endif // _LP64
7320 
7321 
7322     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7323     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7324       vmovdqu(vec1, Address(str1, result, scale));
7325       vpxor(vec1, Address(str2, result, scale));
7326     } else {
7327       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
7328       vpxor(vec1, Address(str2, result, scale2));
7329     }
7330     vptest(vec1, vec1);
7331     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
7332     addptr(result, stride2);
7333     subl(cnt2, stride2);
7334     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
7335     // clean upper bits of YMM registers
7336     vpxor(vec1, vec1);
7337 
7338     // compare wide vectors tail
7339     bind(COMPARE_WIDE_TAIL);
7340     testptr(result, result);
7341     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7342 
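         // A partial tail remains: rewind by one full stride2 so the AVX2 loop below
         // runs once more over the last stride2 elements. Some of them were already
         // compared and found equal, so re-comparing them is harmless.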
7343     movl(result, stride2);
7344     movl(cnt2, result);
7345     negptr(result);
7346     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7347 
7348     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
7349     bind(VECTOR_NOT_EQUAL);
7350     // clean upper bits of YMM registers
7351     vpxor(vec1, vec1);
7352     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7353       lea(str1, Address(str1, result, scale));
7354       lea(str2, Address(str2, result, scale));
7355     } else {
7356       lea(str1, Address(str1, result, scale1));
7357       lea(str2, Address(str2, result, scale2));
7358     }
7359     jmp(COMPARE_16_CHARS);
7360 
7361     // Compare tail chars, length between 1 and 15 chars
7362     bind(COMPARE_TAIL_LONG);
7363     movl(cnt2, result);
7364     cmpl(cnt2, stride);
7365     jcc(Assembler::less, COMPARE_SMALL_STR);
7366 
7367     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7368       movdqu(vec1, Address(str1, 0));
7369     } else {
7370       pmovzxbw(vec1, Address(str1, 0));
7371     }
7372     pcmpestri(vec1, Address(str2, 0), pcmpmask);
7373     jcc(Assembler::below, COMPARE_INDEX_CHAR);
7374     subptr(cnt2, stride);
7375     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7376     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7377       lea(str1, Address(str1, result, scale));
7378       lea(str2, Address(str2, result, scale));
7379     } else {
7380       lea(str1, Address(str1, result, scale1));
7381       lea(str2, Address(str2, result, scale2));
7382     }
7383     negptr(cnt2);
7384     jmpb(WHILE_HEAD_LABEL);
7385 
7386     bind(COMPARE_SMALL_STR);
7387   } else if (UseSSE42Intrinsics) {
7388     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
7389     int pcmpmask = 0x19;
7390     // Set up to compare 8-char (16-byte) vectors,
7391     // starting from the first character again because its address is aligned.
7392     movl(result, cnt2);
7393     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
7394     if (ae == StrIntrinsicNode::LL) {
7395       pcmpmask &= ~0x01;
7396     }
7397     jcc(Assembler::zero, COMPARE_TAIL);
7398     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7399       lea(str1, Address(str1, result, scale));
7400       lea(str2, Address(str2, result, scale));
7401     } else {
7402       lea(str1, Address(str1, result, scale1));
7403       lea(str2, Address(str2, result, scale2));
7404     }
7405     negptr(result);
7406 
7407     // pcmpestri
7408     //   inputs:
7409     //     vec1- substring
7410     //     rax - negative string length (element count)
7411     //     mem - scanned string
7412     //     rdx - string length (element count)
7413     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
7414     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
7415     //   outputs:
7416     //     rcx - first mismatched element index
7417     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7418 
7419     bind(COMPARE_WIDE_VECTORS);
7420     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7421       movdqu(vec1, Address(str1, result, scale));
7422       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
7423     } else {
7424       pmovzxbw(vec1, Address(str1, result, scale1));
7425       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
7426     }
7427     // After pcmpestri cnt1(rcx) contains mismatched element index
7428 
7429     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
7430     addptr(result, stride);
7431     subptr(cnt2, stride);
7432     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
7433 
7434     // compare wide vectors tail
7435     testptr(result, result);
7436     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7437 
7438     movl(cnt2, stride);
7439     movl(result, stride);
7440     negptr(result);
7441     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7442       movdqu(vec1, Address(str1, result, scale));
7443       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
7444     } else {
7445       pmovzxbw(vec1, Address(str1, result, scale1));
7446       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
7447     }
7448     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
7449 
7450     // Mismatched characters in the vectors
7451     bind(VECTOR_NOT_EQUAL);
7452     addptr(cnt1, result);
7453     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7454     subl(result, cnt2);
7455     jmpb(POP_LABEL);
7456 
7457     bind(COMPARE_TAIL); // limit is zero
7458     movl(cnt2, result);
7459     // Fallthru to tail compare
7460   }
7461   // Shift str2 and str1 to the end of the arrays, negate min
7462   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7463     lea(str1, Address(str1, cnt2, scale));
7464     lea(str2, Address(str2, cnt2, scale));
7465   } else {
7466     lea(str1, Address(str1, cnt2, scale1));
7467     lea(str2, Address(str2, cnt2, scale2));
7468   }
7469   decrementl(cnt2);  // first character was compared already
7470   negptr(cnt2);
7471 
7472   // Compare the rest of the elements
7473   bind(WHILE_HEAD_LABEL);
7474   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
7475   subl(result, cnt1);
7476   jccb(Assembler::notZero, POP_LABEL);
7477   increment(cnt2);
7478   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
7479 
7480   // Strings are equal up to min length.  Return the length difference.
7481   bind(LENGTH_DIFF_LABEL);
7482   pop(result);
7483   if (ae == StrIntrinsicNode::UU) {
7484     // Divide diff by 2 to get number of chars
7485     sarl(result, 1);
7486   }
7487   jmpb(DONE_LABEL);
7488 
7489 #ifdef _LP64
7490   if (VM_Version::supports_avx512vlbw()) {
7491 
7492     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
7493 
7494     kmovql(cnt1, k7);
7495     notq(cnt1);
7496     bsfq(cnt2, cnt1);
7497     if (ae != StrIntrinsicNode::LL) {
7498       // Divide diff by 2 to get number of chars
7499       sarl(cnt2, 1);
7500     }
7501     addq(result, cnt2);
7502     if (ae == StrIntrinsicNode::LL) {
7503       load_unsigned_byte(cnt1, Address(str2, result));
7504       load_unsigned_byte(result, Address(str1, result));
7505     } else if (ae == StrIntrinsicNode::UU) {
7506       load_unsigned_short(cnt1, Address(str2, result, scale));
7507       load_unsigned_short(result, Address(str1, result, scale));
7508     } else {
7509       load_unsigned_short(cnt1, Address(str2, result, scale2));
7510       load_unsigned_byte(result, Address(str1, result, scale1));
7511     }
7512     subl(result, cnt1);
7513     jmpb(POP_LABEL);
7514   }//if (VM_Version::supports_avx512vlbw())
7515 #endif // _LP64
7516 
7517   // Discard the stored length difference
7518   bind(POP_LABEL);
7519   pop(cnt1);
7520 
7521   // That's it
7522   bind(DONE_LABEL);
7523   if(ae == StrIntrinsicNode::UL) {
7524     negl(result);
7525   }
7526 
7527 }
7528 
7529 // Search for a non-ASCII character (negative byte value) in a byte array and
7530 // return true if the array contains any, false otherwise.
7531 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
7532 //   @HotSpotIntrinsicCandidate
7533 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
7534 //     for (int i = off; i < off + len; i++) {
7535 //       if (ba[i] < 0) {
7536 //         return true;
7537 //       }
7538 //     }
7539 //     return false;
7540 //   }
7541 void MacroAssembler::has_negatives(Register ary1, Register len,
7542   Register result, Register tmp1,
7543   XMMRegister vec1, XMMRegister vec2) {
7544   // rsi: byte array
7545   // rcx: len
7546   // rax: result
7547   ShortBranchVerifier sbv(this);
7548   assert_different_registers(ary1, len, result, tmp1);
7549   assert_different_registers(vec1, vec2);
7550   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
7551 
7552   // len == 0
7553   testl(len, len);
7554   jcc(Assembler::zero, FALSE_LABEL);
7555 
7556   if ((UseAVX > 2) && // AVX512
7557     VM_Version::supports_avx512vlbw() &&
7558     VM_Version::supports_bmi2()) {
7559 
7560     set_vector_masking();  // opening of the stub context for programming mask registers
7561 
7562     Label test_64_loop, test_tail;
7563     Register tmp3_aliased = len;
7564 
7565     movl(tmp1, len);
7566     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
7567 
7568     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
7569     andl(len, ~(64 - 1));    // vector count (in chars)
7570     jccb(Assembler::zero, test_tail);
7571 
7572     lea(ary1, Address(ary1, len, Address::times_1));
7573     negptr(len);
7574 
7575     bind(test_64_loop);
7576     // Check whether these 64 byte-sized elements contain any negatives
7577     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
7578     kortestql(k2, k2);
7579     jcc(Assembler::notZero, TRUE_LABEL);
7580 
7581     addptr(len, 64);
7582     jccb(Assembler::notZero, test_64_loop);
7583 
7584 
7585     bind(test_tail);
7586     // bail out when there is nothing to be done
7587     testl(tmp1, -1);
7588     jcc(Assembler::zero, FALSE_LABEL);
7589 
7590     // Save k1
7591     kmovql(k3, k1);
7592 
7593     // ~(~0 << len) applied up to two times (for 32-bit scenario)
7594 #ifdef _LP64
7595     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
7596     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
7597     notq(tmp3_aliased);
7598     kmovql(k1, tmp3_aliased);
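         // e.g. tmp1 == 3 yields the mask 0b111, so k1 selects exactly the 3 tail bytes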
7599 #else
7600     Label k_init;
7601     jmp(k_init);
7602 
7603     // We cannot build a 64-bit mask in a general purpose register here, so we
7604     // move the data required to compose it into the instruction stream instead:
7605     // a 64-byte series of the values 0..63, which is later compared against the
7606     // tail count broadcast from the tmp1 register.
7607     // The result is the k1 register holding tmp1 consecutive 1 bits, counting
7608     // from the least significant bit.
7609     address tmp = pc();
7610     emit_int64(0x0706050403020100);
7611     emit_int64(0x0F0E0D0C0B0A0908);
7612     emit_int64(0x1716151413121110);
7613     emit_int64(0x1F1E1D1C1B1A1918);
7614     emit_int64(0x2726252423222120);
7615     emit_int64(0x2F2E2D2C2B2A2928);
7616     emit_int64(0x3736353433323130);
7617     emit_int64(0x3F3E3D3C3B3A3938);
7618 
7619     bind(k_init);
7620     lea(len, InternalAddress(tmp));
7621     // create mask to test for negative byte inside a vector
7622     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
7623     evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
7624 
7625 #endif
7626     evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
7627     ktestq(k2, k1);
7628     // Restore k1
7629     kmovql(k1, k3);
7630     jcc(Assembler::notZero, TRUE_LABEL);
7631 
7632     jmp(FALSE_LABEL);
7633 
7634     clear_vector_masking();   // closing of the stub context for programming mask registers
7635   } else {
7636     movl(result, len); // copy
7637 
7638     if (UseAVX == 2 && UseSSE >= 2) {
7639       // With AVX2, use 32-byte vector compare
7640       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7641 
7642       // Compare 32-byte vectors
7643       andl(result, 0x0000001f);  //   tail count (in bytes)
7644       andl(len, 0xffffffe0);   // vector count (in bytes)
7645       jccb(Assembler::zero, COMPARE_TAIL);
7646 
7647       lea(ary1, Address(ary1, len, Address::times_1));
7648       negptr(len);
7649 
7650       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
7651       movdl(vec2, tmp1);
7652       vpbroadcastd(vec2, vec2);
7653 
7654       bind(COMPARE_WIDE_VECTORS);
7655       vmovdqu(vec1, Address(ary1, len, Address::times_1));
7656       vptest(vec1, vec2);
7657       jccb(Assembler::notZero, TRUE_LABEL);
7658       addptr(len, 32);
7659       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7660 
7661       testl(result, result);
7662       jccb(Assembler::zero, FALSE_LABEL);
7663 
7664       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7665       vptest(vec1, vec2);
7666       jccb(Assembler::notZero, TRUE_LABEL);
7667       jmpb(FALSE_LABEL);
7668 
7669       bind(COMPARE_TAIL); // len is zero
7670       movl(len, result);
7671       // Fallthru to tail compare
7672     } else if (UseSSE42Intrinsics) {
7673       // With SSE4.2, use double quad vector compare
7674       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7675 
7676       // Compare 16-byte vectors
7677       andl(result, 0x0000000f);  //   tail count (in bytes)
7678       andl(len, 0xfffffff0);   // vector count (in bytes)
7679       jccb(Assembler::zero, COMPARE_TAIL);
7680 
7681       lea(ary1, Address(ary1, len, Address::times_1));
7682       negptr(len);
7683 
7684       movl(tmp1, 0x80808080);
7685       movdl(vec2, tmp1);
7686       pshufd(vec2, vec2, 0);
7687 
7688       bind(COMPARE_WIDE_VECTORS);
7689       movdqu(vec1, Address(ary1, len, Address::times_1));
7690       ptest(vec1, vec2);
7691       jccb(Assembler::notZero, TRUE_LABEL);
7692       addptr(len, 16);
7693       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7694 
7695       testl(result, result);
7696       jccb(Assembler::zero, FALSE_LABEL);
7697 
7698       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7699       ptest(vec1, vec2);
7700       jccb(Assembler::notZero, TRUE_LABEL);
7701       jmpb(FALSE_LABEL);
7702 
7703       bind(COMPARE_TAIL); // len is zero
7704       movl(len, result);
7705       // Fallthru to tail compare
7706     }
7707   }
7708   // Compare 4-byte vectors
7709   andl(len, 0xfffffffc); // vector count (in bytes)
7710   jccb(Assembler::zero, COMPARE_CHAR);
7711 
7712   lea(ary1, Address(ary1, len, Address::times_1));
7713   negptr(len);
7714 
7715   bind(COMPARE_VECTORS);
7716   movl(tmp1, Address(ary1, len, Address::times_1));
7717   andl(tmp1, 0x80808080);
7718   jccb(Assembler::notZero, TRUE_LABEL);
7719   addptr(len, 4);
7720   jcc(Assembler::notZero, COMPARE_VECTORS);
7721 
7722   // Compare trailing char (final 2 bytes), if any
7723   bind(COMPARE_CHAR);
7724   testl(result, 0x2);   // tail  char
7725   jccb(Assembler::zero, COMPARE_BYTE);
7726   load_unsigned_short(tmp1, Address(ary1, 0));
7727   andl(tmp1, 0x00008080);
7728   jccb(Assembler::notZero, TRUE_LABEL);
7729   subptr(result, 2);
7730   lea(ary1, Address(ary1, 2));
7731 
7732   bind(COMPARE_BYTE);
7733   testl(result, 0x1);   // tail  byte
7734   jccb(Assembler::zero, FALSE_LABEL);
7735   load_unsigned_byte(tmp1, Address(ary1, 0));
7736   andl(tmp1, 0x00000080);
7737   jccb(Assembler::notEqual, TRUE_LABEL);
7738   jmpb(FALSE_LABEL);
7739 
7740   bind(TRUE_LABEL);
7741   movl(result, 1);   // return true
7742   jmpb(DONE);
7743 
7744   bind(FALSE_LABEL);
7745   xorl(result, result); // return false
7746 
7747   // That's it
7748   bind(DONE);
7749   if (UseAVX >= 2 && UseSSE >= 2) {
7750     // clean upper bits of YMM registers
7751     vpxor(vec1, vec1);
7752     vpxor(vec2, vec2);
7753   }
7754 }
7755 // Compare char[] or byte[] arrays (aligned to 4 bytes) or substrings.
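     // Roughly, for the whole-array case (a sketch of the intended semantics, not
     // the exact library code; the substring case skips the reference/null/length
     // checks and simply compares 'limit' elements at the given addresses):
     //   if (ary1 == ary2) return true;
     //   if (ary1 == null || ary2 == null) return false;
     //   if (ary1.length != ary2.length) return false;
     //   for (int i = 0; i < ary1.length; i++) {
     //     if (ary1[i] != ary2[i]) return false;
     //   }
     //   return true;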
7756 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
7757                                    Register limit, Register result, Register chr,
7758                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
7759   ShortBranchVerifier sbv(this);
7760   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
7761 
7762   int length_offset  = arrayOopDesc::length_offset_in_bytes();
7763   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
7764 
7765   if (is_array_equ) {
7766     // Check the input args
7767     cmpoop(ary1, ary2);
7768     jcc(Assembler::equal, TRUE_LABEL);
7769 
7770     // Need additional checks for arrays_equals.
7771     testptr(ary1, ary1);
7772     jcc(Assembler::zero, FALSE_LABEL);
7773     testptr(ary2, ary2);
7774     jcc(Assembler::zero, FALSE_LABEL);
7775 
7776     // Check the lengths
7777     movl(limit, Address(ary1, length_offset));
7778     cmpl(limit, Address(ary2, length_offset));
7779     jcc(Assembler::notEqual, FALSE_LABEL);
7780   }
7781 
7782   // count == 0
7783   testl(limit, limit);
7784   jcc(Assembler::zero, TRUE_LABEL);
7785 
7786   if (is_array_equ) {
7787     // Load array address
7788     lea(ary1, Address(ary1, base_offset));
7789     lea(ary2, Address(ary2, base_offset));
7790   }
7791 
7792   if (is_array_equ && is_char) {
7793     // arrays_equals used for char[]: convert the element count to a byte count.
7794     shll(limit, 1);      // byte count != 0
7795   }
7796   movl(result, limit); // copy
7797 
7798   if (UseAVX >= 2) {
7799     // With AVX2, use 32-byte vector compare
7800     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7801 
7802     // Compare 32-byte vectors
7803     andl(result, 0x0000001f);  //   tail count (in bytes)
7804     andl(limit, 0xffffffe0);   // vector count (in bytes)
7805     jcc(Assembler::zero, COMPARE_TAIL);
7806 
7807     lea(ary1, Address(ary1, limit, Address::times_1));
7808     lea(ary2, Address(ary2, limit, Address::times_1));
7809     negptr(limit);
7810 
7811     bind(COMPARE_WIDE_VECTORS);
7812 
7813 #ifdef _LP64
7814     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7815       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
7816 
7817       cmpl(limit, -64);
7818       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7819 
7820       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7821 
7822       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
7823       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
7824       kortestql(k7, k7);
7825       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7826       addptr(limit, 64);  // update since we already compared at this addr
7827       cmpl(limit, -64);
7828       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7829 
7830       // At this point we may still need to compare -limit+result bytes.
7831       // We could execute the next two instructions and just continue via the non-wide path:
7832       //  cmpl(limit, 0);
7833       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
7834       // But since we stopped at the points ary{1,2}+limit which are
7835       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
7836       // (|limit| <= 32 and result < 32),
7837       // we may just compare the last 64 bytes.
7838       //
7839       addptr(result, -64);   // it is safe because we just came from this area
7840       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
7841       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
7842       kortestql(k7, k7);
7843       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7844 
7845       jmp(TRUE_LABEL);
7846 
7847       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7848 
7849     }//if (VM_Version::supports_avx512vlbw())
7850 #endif //_LP64
7851 
7852     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
7853     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
7854     vpxor(vec1, vec2);
7855 
7856     vptest(vec1, vec1);
7857     jcc(Assembler::notZero, FALSE_LABEL);
7858     addptr(limit, 32);
7859     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7860 
7861     testl(result, result);
7862     jcc(Assembler::zero, TRUE_LABEL);
7863 
7864     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7865     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
7866     vpxor(vec1, vec2);
7867 
7868     vptest(vec1, vec1);
7869     jccb(Assembler::notZero, FALSE_LABEL);
7870     jmpb(TRUE_LABEL);
7871 
7872     bind(COMPARE_TAIL); // limit is zero
7873     movl(limit, result);
7874     // Fallthru to tail compare
7875   } else if (UseSSE42Intrinsics) {
7876     // With SSE4.2, use double quad vector compare
7877     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7878 
7879     // Compare 16-byte vectors
7880     andl(result, 0x0000000f);  //   tail count (in bytes)
7881     andl(limit, 0xfffffff0);   // vector count (in bytes)
7882     jcc(Assembler::zero, COMPARE_TAIL);
7883 
7884     lea(ary1, Address(ary1, limit, Address::times_1));
7885     lea(ary2, Address(ary2, limit, Address::times_1));
7886     negptr(limit);
7887 
7888     bind(COMPARE_WIDE_VECTORS);
7889     movdqu(vec1, Address(ary1, limit, Address::times_1));
7890     movdqu(vec2, Address(ary2, limit, Address::times_1));
7891     pxor(vec1, vec2);
7892 
7893     ptest(vec1, vec1);
7894     jcc(Assembler::notZero, FALSE_LABEL);
7895     addptr(limit, 16);
7896     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7897 
7898     testl(result, result);
7899     jcc(Assembler::zero, TRUE_LABEL);
7900 
7901     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7902     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7903     pxor(vec1, vec2);
7904 
7905     ptest(vec1, vec1);
7906     jccb(Assembler::notZero, FALSE_LABEL);
7907     jmpb(TRUE_LABEL);
7908 
7909     bind(COMPARE_TAIL); // limit is zero
7910     movl(limit, result);
7911     // Fallthru to tail compare
7912   }
7913 
7914   // Compare 4-byte vectors
7915   andl(limit, 0xfffffffc); // vector count (in bytes)
7916   jccb(Assembler::zero, COMPARE_CHAR);
7917 
7918   lea(ary1, Address(ary1, limit, Address::times_1));
7919   lea(ary2, Address(ary2, limit, Address::times_1));
7920   negptr(limit);
7921 
7922   bind(COMPARE_VECTORS);
7923   movl(chr, Address(ary1, limit, Address::times_1));
7924   cmpl(chr, Address(ary2, limit, Address::times_1));
7925   jccb(Assembler::notEqual, FALSE_LABEL);
7926   addptr(limit, 4);
7927   jcc(Assembler::notZero, COMPARE_VECTORS);
7928 
7929   // Compare trailing char (final 2 bytes), if any
7930   bind(COMPARE_CHAR);
7931   testl(result, 0x2);   // tail  char
7932   jccb(Assembler::zero, COMPARE_BYTE);
7933   load_unsigned_short(chr, Address(ary1, 0));
7934   load_unsigned_short(limit, Address(ary2, 0));
7935   cmpl(chr, limit);
7936   jccb(Assembler::notEqual, FALSE_LABEL);
7937 
7938   if (is_array_equ && is_char) {
7939     bind(COMPARE_BYTE);
7940   } else {
7941     lea(ary1, Address(ary1, 2));
7942     lea(ary2, Address(ary2, 2));
7943 
7944     bind(COMPARE_BYTE);
7945     testl(result, 0x1);   // tail  byte
7946     jccb(Assembler::zero, TRUE_LABEL);
7947     load_unsigned_byte(chr, Address(ary1, 0));
7948     load_unsigned_byte(limit, Address(ary2, 0));
7949     cmpl(chr, limit);
7950     jccb(Assembler::notEqual, FALSE_LABEL);
7951   }
7952   bind(TRUE_LABEL);
7953   movl(result, 1);   // return true
7954   jmpb(DONE);
7955 
7956   bind(FALSE_LABEL);
7957   xorl(result, result); // return false
7958 
7959   // That's it
7960   bind(DONE);
7961   if (UseAVX >= 2) {
7962     // clean upper bits of YMM registers
7963     vpxor(vec1, vec1);
7964     vpxor(vec2, vec2);
7965   }
7966 }
7967 
7968 #endif
7969 
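     // Fill 'count' elements of type 't' starting at 'to' with 'value'.
     // Roughly (a sketch of the intended semantics):
     //   for (int i = 0; i < count; i++) {
     //     to[i] = value;
     //   }
     // The value is first replicated across a 32-bit pattern so wider stores can be used.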
7970 void MacroAssembler::generate_fill(BasicType t, bool aligned,
7971                                    Register to, Register value, Register count,
7972                                    Register rtmp, XMMRegister xtmp) {
7973   ShortBranchVerifier sbv(this);
7974   assert_different_registers(to, value, count, rtmp);
7975   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
7976   Label L_fill_2_bytes, L_fill_4_bytes;
7977 
7978   int shift = -1;
7979   switch (t) {
7980     case T_BYTE:
7981       shift = 2;
7982       break;
7983     case T_SHORT:
7984       shift = 1;
7985       break;
7986     case T_INT:
7987       shift = 0;
7988       break;
7989     default: ShouldNotReachHere();
7990   }
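       // shift == log2(number of elements per 32-bit word), so expressions such as
       // (8 << shift) below give the element count of a 32-byte chunk.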
7991 
7992   if (t == T_BYTE) {
7993     andl(value, 0xff);
7994     movl(rtmp, value);
7995     shll(rtmp, 8);
7996     orl(value, rtmp);
7997   }
7998   if (t == T_SHORT) {
7999     andl(value, 0xffff);
8000   }
8001   if (t == T_BYTE || t == T_SHORT) {
8002     movl(rtmp, value);
8003     shll(rtmp, 16);
8004     orl(value, rtmp);
8005   }
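       // For byte and short fills, 'value' now holds the fill pattern replicated
       // across all 32 bits (e.g. byte 0x41 becomes 0x41414141).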
8006 
8007   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) are filled by element
8008   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
8009   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
8010     // align the destination address to a 4-byte boundary
8011     if (t == T_BYTE) {
8012       // One byte misalignment happens only for byte arrays
8013       testptr(to, 1);
8014       jccb(Assembler::zero, L_skip_align1);
8015       movb(Address(to, 0), value);
8016       increment(to);
8017       decrement(count);
8018       BIND(L_skip_align1);
8019     }
8020     // Two bytes misalignment happens only for byte and short (char) arrays
8021     testptr(to, 2);
8022     jccb(Assembler::zero, L_skip_align2);
8023     movw(Address(to, 0), value);
8024     addptr(to, 2);
8025     subl(count, 1<<(shift-1));
8026     BIND(L_skip_align2);
8027   }
8028   if (UseSSE < 2) {
8029     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8030     // Fill 32-byte chunks
8031     subl(count, 8 << shift);
8032     jcc(Assembler::less, L_check_fill_8_bytes);
8033     align(16);
8034 
8035     BIND(L_fill_32_bytes_loop);
8036 
8037     for (int i = 0; i < 32; i += 4) {
8038       movl(Address(to, i), value);
8039     }
8040 
8041     addptr(to, 32);
8042     subl(count, 8 << shift);
8043     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8044     BIND(L_check_fill_8_bytes);
8045     addl(count, 8 << shift);
8046     jccb(Assembler::zero, L_exit);
8047     jmpb(L_fill_8_bytes);
8048 
8049     //
8050     // length is too short, just fill qwords
8051     //
8052     BIND(L_fill_8_bytes_loop);
8053     movl(Address(to, 0), value);
8054     movl(Address(to, 4), value);
8055     addptr(to, 8);
8056     BIND(L_fill_8_bytes);
8057     subl(count, 1 << (shift + 1));
8058     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8059     // fall through to fill 4 bytes
8060   } else {
8061     Label L_fill_32_bytes;
8062     if (!UseUnalignedLoadStores) {
8063       // align to 8 bytes, we know we are 4 byte aligned to start
8064       testptr(to, 4);
8065       jccb(Assembler::zero, L_fill_32_bytes);
8066       movl(Address(to, 0), value);
8067       addptr(to, 4);
8068       subl(count, 1<<shift);
8069     }
8070     BIND(L_fill_32_bytes);
8071     {
8072       assert( UseSSE >= 2, "supported cpu only" );
8073       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8074       if (UseAVX > 2) {
8075         movl(rtmp, 0xffff);
8076         kmovwl(k1, rtmp);
8077       }
8078       movdl(xtmp, value);
8079       if (UseAVX > 2 && UseUnalignedLoadStores) {
8080         // Fill 64-byte chunks
8081         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8082         evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8083 
8084         subl(count, 16 << shift);
8085         jcc(Assembler::less, L_check_fill_32_bytes);
8086         align(16);
8087 
8088         BIND(L_fill_64_bytes_loop);
8089         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8090         addptr(to, 64);
8091         subl(count, 16 << shift);
8092         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8093 
8094         BIND(L_check_fill_32_bytes);
8095         addl(count, 8 << shift);
8096         jccb(Assembler::less, L_check_fill_8_bytes);
8097         vmovdqu(Address(to, 0), xtmp);
8098         addptr(to, 32);
8099         subl(count, 8 << shift);
8100 
8101         BIND(L_check_fill_8_bytes);
8102       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8103         // Fill 64-byte chunks
8104         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8105         vpbroadcastd(xtmp, xtmp);
8106 
8107         subl(count, 16 << shift);
8108         jcc(Assembler::less, L_check_fill_32_bytes);
8109         align(16);
8110 
8111         BIND(L_fill_64_bytes_loop);
8112         vmovdqu(Address(to, 0), xtmp);
8113         vmovdqu(Address(to, 32), xtmp);
8114         addptr(to, 64);
8115         subl(count, 16 << shift);
8116         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8117 
8118         BIND(L_check_fill_32_bytes);
8119         addl(count, 8 << shift);
8120         jccb(Assembler::less, L_check_fill_8_bytes);
8121         vmovdqu(Address(to, 0), xtmp);
8122         addptr(to, 32);
8123         subl(count, 8 << shift);
8124 
8125         BIND(L_check_fill_8_bytes);
8126         // clean upper bits of YMM registers
8127         movdl(xtmp, value);
8128         pshufd(xtmp, xtmp, 0);
8129       } else {
8130         // Fill 32-byte chunks
8131         pshufd(xtmp, xtmp, 0);
8132 
8133         subl(count, 8 << shift);
8134         jcc(Assembler::less, L_check_fill_8_bytes);
8135         align(16);
8136 
8137         BIND(L_fill_32_bytes_loop);
8138 
8139         if (UseUnalignedLoadStores) {
8140           movdqu(Address(to, 0), xtmp);
8141           movdqu(Address(to, 16), xtmp);
8142         } else {
8143           movq(Address(to, 0), xtmp);
8144           movq(Address(to, 8), xtmp);
8145           movq(Address(to, 16), xtmp);
8146           movq(Address(to, 24), xtmp);
8147         }
8148 
8149         addptr(to, 32);
8150         subl(count, 8 << shift);
8151         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8152 
8153         BIND(L_check_fill_8_bytes);
8154       }
8155       addl(count, 8 << shift);
8156       jccb(Assembler::zero, L_exit);
8157       jmpb(L_fill_8_bytes);
8158 
8159       //
8160       // length is too short, just fill qwords
8161       //
8162       BIND(L_fill_8_bytes_loop);
8163       movq(Address(to, 0), xtmp);
8164       addptr(to, 8);
8165       BIND(L_fill_8_bytes);
8166       subl(count, 1 << (shift + 1));
8167       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8168     }
8169   }
8170   // fill trailing 4 bytes
8171   BIND(L_fill_4_bytes);
8172   testl(count, 1<<shift);
8173   jccb(Assembler::zero, L_fill_2_bytes);
8174   movl(Address(to, 0), value);
8175   if (t == T_BYTE || t == T_SHORT) {
8176     addptr(to, 4);
8177     BIND(L_fill_2_bytes);
8178     // fill trailing 2 bytes
8179     testl(count, 1<<(shift-1));
8180     jccb(Assembler::zero, L_fill_byte);
8181     movw(Address(to, 0), value);
8182     if (t == T_BYTE) {
8183       addptr(to, 2);
8184       BIND(L_fill_byte);
8185       // fill trailing byte
8186       testl(count, 1);
8187       jccb(Assembler::zero, L_exit);
8188       movb(Address(to, 0), value);
8189     } else {
8190       BIND(L_fill_byte);
8191     }
8192   } else {
8193     BIND(L_fill_2_bytes);
8194   }
8195   BIND(L_exit);
8196 }
8197 
8198 // encode char[] to byte[] in ISO_8859_1
8199 //   @HotSpotIntrinsicCandidate
8200 //   private static int implEncodeISOArray(byte[] sa, int sp,
8201 //                                         byte[] da, int dp, int len) {
8202 //     int i = 0;
8203 //     for (; i < len; i++) {
8204 //       char c = StringUTF16.getChar(sa, sp++);
8205 //       if (c > '\u00FF')
8206 //         break;
8207 //       da[dp++] = (byte)c;
8208 //     }
8209 //     return i;
8210 //   }
8211 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8212   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8213   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8214   Register tmp5, Register result) {
8215 
8216   // rsi: src
8217   // rdi: dst
8218   // rdx: len
8219   // rcx: tmp5
8220   // rax: result
8221   ShortBranchVerifier sbv(this);
8222   assert_different_registers(src, dst, len, tmp5, result);
8223   Label L_done, L_copy_1_char, L_copy_1_char_exit;
8224 
8225   // set result
8226   xorl(result, result);
8227   // check for zero length
8228   testl(len, len);
8229   jcc(Assembler::zero, L_done);
8230 
8231   movl(result, len);
8232 
8233   // Setup pointers
8234   lea(src, Address(src, len, Address::times_2)); // char[]
8235   lea(dst, Address(dst, len, Address::times_1)); // byte[]
8236   negptr(len);
8237 
8238   if (UseSSE42Intrinsics || UseAVX >= 2) {
8239     Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8240     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8241 
8242     if (UseAVX >= 2) {
8243       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8244       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8245       movdl(tmp1Reg, tmp5);
8246       vpbroadcastd(tmp1Reg, tmp1Reg);
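           // tmp1Reg now holds 0xff00ff00 in every dword; vptest against it below
           // sets ZF only when no char has its high byte set (all chars <= 0xFF).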
8247       jmp(L_chars_32_check);
8248 
8249       bind(L_copy_32_chars);
8250       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8251       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8252       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8253       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
8254       jccb(Assembler::notZero, L_copy_32_chars_exit);
8255       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8256       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8257       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8258 
8259       bind(L_chars_32_check);
8260       addptr(len, 32);
8261       jcc(Assembler::lessEqual, L_copy_32_chars);
8262 
8263       bind(L_copy_32_chars_exit);
8264       subptr(len, 16);
8265       jccb(Assembler::greater, L_copy_16_chars_exit);
8266 
8267     } else if (UseSSE42Intrinsics) {
8268       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8269       movdl(tmp1Reg, tmp5);
8270       pshufd(tmp1Reg, tmp1Reg, 0);
8271       jmpb(L_chars_16_check);
8272     }
8273 
8274     bind(L_copy_16_chars);
8275     if (UseAVX >= 2) {
8276       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
8277       vptest(tmp2Reg, tmp1Reg);
8278       jcc(Assembler::notZero, L_copy_16_chars_exit);
8279       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
8280       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
8281     } else {
8282       if (UseAVX > 0) {
8283         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8284         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8285         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
8286       } else {
8287         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8288         por(tmp2Reg, tmp3Reg);
8289         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8290         por(tmp2Reg, tmp4Reg);
8291       }
8292       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
8293       jccb(Assembler::notZero, L_copy_16_chars_exit);
8294       packuswb(tmp3Reg, tmp4Reg);
8295     }
8296     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
8297 
8298     bind(L_chars_16_check);
8299     addptr(len, 16);
8300     jcc(Assembler::lessEqual, L_copy_16_chars);
8301 
8302     bind(L_copy_16_chars_exit);
8303     if (UseAVX >= 2) {
8304       // clean upper bits of YMM registers
8305       vpxor(tmp2Reg, tmp2Reg);
8306       vpxor(tmp3Reg, tmp3Reg);
8307       vpxor(tmp4Reg, tmp4Reg);
8308       movdl(tmp1Reg, tmp5);
8309       pshufd(tmp1Reg, tmp1Reg, 0);
8310     }
8311     subptr(len, 8);
8312     jccb(Assembler::greater, L_copy_8_chars_exit);
8313 
8314     bind(L_copy_8_chars);
8315     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
8316     ptest(tmp3Reg, tmp1Reg);
8317     jccb(Assembler::notZero, L_copy_8_chars_exit);
8318     packuswb(tmp3Reg, tmp1Reg);
8319     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8320     addptr(len, 8);
8321     jccb(Assembler::lessEqual, L_copy_8_chars);
8322 
8323     bind(L_copy_8_chars_exit);
8324     subptr(len, 8);
8325     jccb(Assembler::zero, L_done);
8326   }
8327 
8328   bind(L_copy_1_char);
8329   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8330   testl(tmp5, 0xff00);      // check if Unicode char
8331   jccb(Assembler::notZero, L_copy_1_char_exit);
8332   movb(Address(dst, len, Address::times_1, 0), tmp5);
8333   addptr(len, 1);
8334   jccb(Assembler::less, L_copy_1_char);
8335 
8336   bind(L_copy_1_char_exit);
8337   addptr(result, len); // len is the negative count of unprocessed elements
8338 
8339   bind(L_done);
8340 }
8341 
8342 #ifdef _LP64
8343 /**
8344  * Helper for multiply_to_len(): adds src1 and src2 into the 128-bit value dest_hi:dest_lo.
8345  */
8346 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
8347   addq(dest_lo, src1);
8348   adcq(dest_hi, 0);
8349   addq(dest_lo, src2);
8350   adcq(dest_hi, 0);
8351 }
8352 
8353 /**
8354  * Multiply 64 bit by 64 bit first loop.
8355  */
8356 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
8357                                            Register y, Register y_idx, Register z,
8358                                            Register carry, Register product,
8359                                            Register idx, Register kdx) {
8360   //
8361   //  jlong carry, x[], y[], z[];
8362   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8363   //    huge_128 product = y[idx] * x[xstart] + carry;
8364   //    z[kdx] = (jlong)product;
8365   //    carry  = (jlong)(product >>> 64);
8366   //  }
8367   //  z[xstart] = carry;
8368   //
8369 
8370   Label L_first_loop, L_first_loop_exit;
8371   Label L_one_x, L_one_y, L_multiply;
8372 
8373   decrementl(xstart);
8374   jcc(Assembler::negative, L_one_x);
8375 
8376   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8377   rorq(x_xstart, 32); // convert big-endian to little-endian
8378 
8379   bind(L_first_loop);
8380   decrementl(idx);
8381   jcc(Assembler::negative, L_first_loop_exit);
8382   decrementl(idx);
8383   jcc(Assembler::negative, L_one_y);
8384   movq(y_idx, Address(y, idx, Address::times_4,  0));
8385   rorq(y_idx, 32); // convert big-endian to little-endian
8386   bind(L_multiply);
8387   movq(product, x_xstart);
8388   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
8389   addq(product, carry);
8390   adcq(rdx, 0);
8391   subl(kdx, 2);
8392   movl(Address(z, kdx, Address::times_4,  4), product);
8393   shrq(product, 32);
8394   movl(Address(z, kdx, Address::times_4,  0), product);
8395   movq(carry, rdx);
8396   jmp(L_first_loop);
8397 
8398   bind(L_one_y);
8399   movl(y_idx, Address(y,  0));
8400   jmp(L_multiply);
8401 
8402   bind(L_one_x);
8403   movl(x_xstart, Address(x,  0));
8404   jmp(L_first_loop);
8405 
8406   bind(L_first_loop_exit);
8407 }
8408 
8409 /**
8410  * Multiply 64 bit by 64 bit and add 128 bit.
8411  */
8412 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
8413                                             Register yz_idx, Register idx,
8414                                             Register carry, Register product, int offset) {
8415   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
8416   //     z[kdx] = (jlong)product;
8417 
8418   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
8419   rorq(yz_idx, 32); // convert big-endian to little-endian
8420   movq(product, x_xstart);
8421   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
8422   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
8423   rorq(yz_idx, 32); // convert big-endian to little-endian
8424 
8425   add2_with_carry(rdx, product, carry, yz_idx);
8426 
8427   movl(Address(z, idx, Address::times_4,  offset+4), product);
8428   shrq(product, 32);
8429   movl(Address(z, idx, Address::times_4,  offset), product);
8430 
8431 }
8432 
8433 /**
8434  * Multiply 128 bit by 128 bit. Unrolled inner loop.
8435  */
8436 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
8437                                              Register yz_idx, Register idx, Register jdx,
8438                                              Register carry, Register product,
8439                                              Register carry2) {
8440   //   jlong carry, x[], y[], z[];
8441   //   int kdx = ystart+1;
8442   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
8443   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
8444   //     z[kdx+idx+1] = (jlong)product;
8445   //     jlong carry2  = (jlong)(product >>> 64);
8446   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
8447   //     z[kdx+idx] = (jlong)product;
8448   //     carry  = (jlong)(product >>> 64);
8449   //   }
8450   //   idx += 2;
8451   //   if (idx > 0) {
8452   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
8453   //     z[kdx+idx] = (jlong)product;
8454   //     carry  = (jlong)(product >>> 64);
8455   //   }
8456   //
8457 
8458   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
8459 
8460   movl(jdx, idx);
8461   andl(jdx, 0xFFFFFFFC);
8462   shrl(jdx, 2);
8463 
8464   bind(L_third_loop);
8465   subl(jdx, 1);
8466   jcc(Assembler::negative, L_third_loop_exit);
8467   subl(idx, 4);
8468 
8469   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
8470   movq(carry2, rdx);
8471 
8472   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
8473   movq(carry, rdx);
8474   jmp(L_third_loop);
8475 
8476   bind (L_third_loop_exit);
8477 
8478   andl (idx, 0x3);
8479   jcc(Assembler::zero, L_post_third_loop_done);
8480 
8481   Label L_check_1;
8482   subl(idx, 2);
8483   jcc(Assembler::negative, L_check_1);
8484 
8485   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
8486   movq(carry, rdx);
8487 
8488   bind (L_check_1);
8489   addl (idx, 0x2);
8490   andl (idx, 0x1);
8491   subl(idx, 1);
8492   jcc(Assembler::negative, L_post_third_loop_done);
8493 
8494   movl(yz_idx, Address(y, idx, Address::times_4,  0));
8495   movq(product, x_xstart);
8496   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
8497   movl(yz_idx, Address(z, idx, Address::times_4,  0));
8498 
8499   add2_with_carry(rdx, product, yz_idx, carry);
8500 
8501   movl(Address(z, idx, Address::times_4,  0), product);
8502   shrq(product, 32);
8503 
8504   shlq(rdx, 32);
8505   orq(product, rdx);
8506   movq(carry, product);
8507 
8508   bind(L_post_third_loop_done);
8509 }
8510 
8511 /**
8512  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
8513  *
8514  */
8515 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
8516                                                   Register carry, Register carry2,
8517                                                   Register idx, Register jdx,
8518                                                   Register yz_idx1, Register yz_idx2,
8519                                                   Register tmp, Register tmp3, Register tmp4) {
8520   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
8521 
8522   //   jlong carry, x[], y[], z[];
8523   //   int kdx = ystart+1;
8524   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
8525   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
8526   //     jlong carry2  = (jlong)(tmp3 >>> 64);
8527   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
8528   //     carry  = (jlong)(tmp4 >>> 64);
8529   //     z[kdx+idx+1] = (jlong)tmp3;
8530   //     z[kdx+idx] = (jlong)tmp4;
8531   //   }
8532   //   idx += 2;
8533   //   if (idx > 0) {
8534   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
8535   //     z[kdx+idx] = (jlong)yz_idx1;
8536   //     carry  = (jlong)(yz_idx1 >>> 64);
8537   //   }
8538   //
8539 
8540   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
8541 
8542   movl(jdx, idx);
8543   andl(jdx, 0xFFFFFFFC);
8544   shrl(jdx, 2);
8545 
8546   bind(L_third_loop);
8547   subl(jdx, 1);
8548   jcc(Assembler::negative, L_third_loop_exit);
8549   subl(idx, 4);
8550 
8551   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
8552   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
8553   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
8554   rorxq(yz_idx2, yz_idx2, 32);
8555 
8556   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
8557   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
8558 
8559   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
8560   rorxq(yz_idx1, yz_idx1, 32);
8561   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
8562   rorxq(yz_idx2, yz_idx2, 32);
8563 
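       // With ADX, adcx and adox maintain two independent carry chains (CF and OF),
       // so the two partial-product additions do not serialize on a single flag.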
8564   if (VM_Version::supports_adx()) {
8565     adcxq(tmp3, carry);
8566     adoxq(tmp3, yz_idx1);
8567 
8568     adcxq(tmp4, tmp);
8569     adoxq(tmp4, yz_idx2);
8570 
8571     movl(carry, 0); // does not affect flags
8572     adcxq(carry2, carry);
8573     adoxq(carry2, carry);
8574   } else {
8575     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
8576     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
8577   }
8578   movq(carry, carry2);
8579 
8580   movl(Address(z, idx, Address::times_4, 12), tmp3);
8581   shrq(tmp3, 32);
8582   movl(Address(z, idx, Address::times_4,  8), tmp3);
8583 
8584   movl(Address(z, idx, Address::times_4,  4), tmp4);
8585   shrq(tmp4, 32);
8586   movl(Address(z, idx, Address::times_4,  0), tmp4);
8587 
8588   jmp(L_third_loop);
8589 
8590   bind (L_third_loop_exit);
8591 
8592   andl (idx, 0x3);
8593   jcc(Assembler::zero, L_post_third_loop_done);
8594 
8595   Label L_check_1;
8596   subl(idx, 2);
8597   jcc(Assembler::negative, L_check_1);
8598 
8599   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
8600   rorxq(yz_idx1, yz_idx1, 32);
8601   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
8602   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
8603   rorxq(yz_idx2, yz_idx2, 32);
8604 
8605   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
8606 
8607   movl(Address(z, idx, Address::times_4,  4), tmp3);
8608   shrq(tmp3, 32);
8609   movl(Address(z, idx, Address::times_4,  0), tmp3);
8610   movq(carry, tmp4);
8611 
8612   bind (L_check_1);
8613   addl (idx, 0x2);
8614   andl (idx, 0x1);
8615   subl(idx, 1);
8616   jcc(Assembler::negative, L_post_third_loop_done);
8617   movl(tmp4, Address(y, idx, Address::times_4,  0));
8618   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
8619   movl(tmp4, Address(z, idx, Address::times_4,  0));
8620 
8621   add2_with_carry(carry2, tmp3, tmp4, carry);
8622 
8623   movl(Address(z, idx, Address::times_4,  0), tmp3);
8624   shrq(tmp3, 32);
8625 
8626   shlq(carry2, 32);
8627   orq(tmp3, carry2);
8628   movq(carry, tmp3);
8629 
8630   bind(L_post_third_loop_done);
8631 }
8632 
8633 /**
8634  * Code for BigInteger::multiplyToLen() intrinsic.
8635  *
8636  * rdi: x
8637  * rax: xlen
8638  * rsi: y
8639  * rcx: ylen
8640  * r8:  z
8641  * r11: zlen
8642  * r12: tmp1
8643  * r13: tmp2
8644  * r14: tmp3
8645  * r15: tmp4
8646  * rbx: tmp5
8647  *
8648  */
8649 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
8650                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
8651   ShortBranchVerifier sbv(this);
8652   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
8653 
8654   push(tmp1);
8655   push(tmp2);
8656   push(tmp3);
8657   push(tmp4);
8658   push(tmp5);
8659 
8660   push(xlen);
8661   push(zlen);
8662 
8663   const Register idx = tmp1;
8664   const Register kdx = tmp2;
8665   const Register xstart = tmp3;
8666 
8667   const Register y_idx = tmp4;
8668   const Register carry = tmp5;
8669   const Register product  = xlen;
8670   const Register x_xstart = zlen;  // reuse register
8671 
8672   // First Loop.
8673   //
8674   //  final static long LONG_MASK = 0xffffffffL;
8675   //  int xstart = xlen - 1;
8676   //  int ystart = ylen - 1;
8677   //  long carry = 0;
8678   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8679   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
8680   //    z[kdx] = (int)product;
8681   //    carry = product >>> 32;
8682   //  }
8683   //  z[xstart] = (int)carry;
8684   //
8685 
8686   movl(idx, ylen);      // idx = ylen;
8687   movl(kdx, zlen);      // kdx = xlen+ylen;
8688   xorq(carry, carry);   // carry = 0;
8689 
8690   Label L_done;
8691 
8692   movl(xstart, xlen);
8693   decrementl(xstart);
8694   jcc(Assembler::negative, L_done);
8695 
8696   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
8697 
8698   Label L_second_loop;
8699   testl(kdx, kdx);
8700   jcc(Assembler::zero, L_second_loop);
8701 
8702   Label L_carry;
8703   subl(kdx, 1);
8704   jcc(Assembler::zero, L_carry);
8705 
8706   movl(Address(z, kdx, Address::times_4,  0), carry);
8707   shrq(carry, 32);
8708   subl(kdx, 1);
8709 
8710   bind(L_carry);
8711   movl(Address(z, kdx, Address::times_4,  0), carry);
8712 
8713   // Second and third (nested) loops.
8714   //
8715   // for (int i = xstart-1; i >= 0; i--) { // Second loop
8716   //   carry = 0;
8717   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
8718   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
8719   //                    (z[k] & LONG_MASK) + carry;
8720   //     z[k] = (int)product;
8721   //     carry = product >>> 32;
8722   //   }
8723   //   z[i] = (int)carry;
8724   // }
8725   //
8726   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
8727 
8728   const Register jdx = tmp1;
8729 
8730   bind(L_second_loop);
8731   xorl(carry, carry);    // carry = 0;
8732   movl(jdx, ylen);       // j = ystart+1
8733 
8734   subl(xstart, 1);       // i = xstart-1;
8735   jcc(Assembler::negative, L_done);
8736 
8737   push (z);
8738 
8739   Label L_last_x;
8740   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
8741   subl(xstart, 1);       // i = xstart-1;
8742   jcc(Assembler::negative, L_last_x);
8743 
8744   if (UseBMI2Instructions) {
8745     movq(rdx,  Address(x, xstart, Address::times_4,  0));
8746     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
8747   } else {
8748     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8749     rorq(x_xstart, 32);  // convert big-endian to little-endian
8750   }
8751 
8752   Label L_third_loop_prologue;
8753   bind(L_third_loop_prologue);
8754 
8755   push (x);
8756   push (xstart);
8757   push (ylen);
8758 
8759 
8760   if (UseBMI2Instructions) {
8761     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
8762   } else { // !UseBMI2Instructions
8763     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
8764   }
8765 
8766   pop(ylen);
8767   pop(xlen);
8768   pop(x);
8769   pop(z);
8770 
8771   movl(tmp3, xlen);
8772   addl(tmp3, 1);
8773   movl(Address(z, tmp3, Address::times_4,  0), carry);
8774   subl(tmp3, 1);
8775   jccb(Assembler::negative, L_done);
8776 
8777   shrq(carry, 32);
8778   movl(Address(z, tmp3, Address::times_4,  0), carry);
8779   jmp(L_second_loop);
8780 
8781   // Next infrequent code is moved outside loops.
8782   bind(L_last_x);
8783   if (UseBMI2Instructions) {
8784     movl(rdx, Address(x,  0));
8785   } else {
8786     movl(x_xstart, Address(x,  0));
8787   }
8788   jmp(L_third_loop_prologue);
8789 
8790   bind(L_done);
8791 
8792   pop(zlen);
8793   pop(xlen);
8794 
8795   pop(tmp5);
8796   pop(tmp4);
8797   pop(tmp3);
8798   pop(tmp2);
8799   pop(tmp1);
8800 }
8801 
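     // Find the index of the first mismatching element between two memory regions.
     // Roughly (a sketch of the assumed semantics, in the spirit of
     // jdk.internal.util.ArraysSupport.vectorizedMismatch; 'length' is an element
     // count and log2_array_indxscale the element-size shift; the encoding of the
     // "no mismatch" result is produced by code after this excerpt):
     //   for (long i = 0; i < (length << log2_array_indxscale); i++) {
     //     if (obja[i] != objb[i]) return (int)(i >> log2_array_indxscale);
     //   }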
8802 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
8803   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
8804   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
8805   Label VECTOR64_LOOP, VECTOR64_TAIL, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
8806   Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
8807   Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
8808   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
8809   Label SAME_TILL_END, DONE;
8810   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
8811 
8812   //scale is in rcx in both Win64 and Unix
8813   ShortBranchVerifier sbv(this);
8814 
8815   shlq(length);
8816   xorq(result, result);
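       // 'length' arrives as an element count; the shlq above shifts it by cl
       // (rcx holds log2_array_indxscale) to get a byte count, and 'result'
       // accumulates the byte offset compared so far.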
8817 
8818   if ((UseAVX > 2) &&
8819       VM_Version::supports_avx512vlbw()) {
8820     set_vector_masking();  // opening of the stub context for programming mask registers
8821     cmpq(length, 64);
8822     jcc(Assembler::less, VECTOR32_TAIL);
8823     movq(tmp1, length);
8824     andq(tmp1, 0x3F);      // tail count
8825     andq(length, ~(0x3F)); //vector count
8826 
8827     bind(VECTOR64_LOOP);
8828     // AVX512 code to compare 64 byte vectors.
8829     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
8830     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
8831     kortestql(k7, k7);
8832     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
8833     addq(result, 64);
8834     subq(length, 64);
8835     jccb(Assembler::notZero, VECTOR64_LOOP);
8836 
8837     //bind(VECTOR64_TAIL);
8838     testq(tmp1, tmp1);
8839     jcc(Assembler::zero, SAME_TILL_END);
8840 
8841     bind(VECTOR64_TAIL);
8842     // AVX512 code to compare up to 63 bytes.
8843     // Save k1
8844     kmovql(k3, k1);
8845     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
8846     shlxq(tmp2, tmp2, tmp1);
8847     notq(tmp2);
8848     kmovql(k1, tmp2);
8849 
8850     evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
8851     evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
8852 
8853     ktestql(k7, k1);
8854     // Restore k1
8855     kmovql(k1, k3);
8856     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
8857 
8858     bind(VECTOR64_NOT_EQUAL);
8859     kmovql(tmp1, k7);
8860     notq(tmp1);
8861     tzcntq(tmp1, tmp1);
8862     addq(result, tmp1);
8863     shrq(result);
8864     jmp(DONE);
8865     bind(VECTOR32_TAIL);
8866     clear_vector_masking();   // closing of the stub context for programming mask registers
8867   }
8868 
8869   cmpq(length, 8);
8870   jcc(Assembler::equal, VECTOR8_LOOP);
8871   jcc(Assembler::less, VECTOR4_TAIL);
8872 
8873   if (UseAVX >= 2) {
8874 
8875     cmpq(length, 16);
8876     jcc(Assembler::equal, VECTOR16_LOOP);
8877     jcc(Assembler::less, VECTOR8_LOOP);
8878 
8879     cmpq(length, 32);
8880     jccb(Assembler::less, VECTOR16_TAIL);
8881 
8882     subq(length, 32);
8883     bind(VECTOR32_LOOP);
8884     vmovdqu(rymm0, Address(obja, result));
8885     vmovdqu(rymm1, Address(objb, result));
8886     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8887     vptest(rymm2, rymm2);
8888     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8889     addq(result, 32);
8890     subq(length, 32);
8891     jccb(Assembler::greaterEqual, VECTOR32_LOOP);
8892     addq(length, 32);
8893     jcc(Assembler::equal, SAME_TILL_END);
8894     //falling through if less than 32 bytes left //close the branch here.
8895 
8896     bind(VECTOR16_TAIL);
8897     cmpq(length, 16);
8898     jccb(Assembler::less, VECTOR8_TAIL);
8899     bind(VECTOR16_LOOP);
8900     movdqu(rymm0, Address(obja, result));
8901     movdqu(rymm1, Address(objb, result));
8902     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8903     ptest(rymm2, rymm2);
8904     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8905     addq(result, 16);
8906     subq(length, 16);
8907     jcc(Assembler::equal, SAME_TILL_END);
8908     //falling through if less than 16 bytes left
8909   } else {//regular intrinsics
8910 
8911     cmpq(length, 16);
8912     jccb(Assembler::less, VECTOR8_TAIL);
8913 
8914     subq(length, 16);
8915     bind(VECTOR16_LOOP);
8916     movdqu(rymm0, Address(obja, result));
8917     movdqu(rymm1, Address(objb, result));
8918     pxor(rymm0, rymm1);
8919     ptest(rymm0, rymm0);
8920     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8921     addq(result, 16);
8922     subq(length, 16);
8923     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8924     addq(length, 16);
8925     jcc(Assembler::equal, SAME_TILL_END);
8926     //falling through if less than 16 bytes left
8927   }
8928 
8929   bind(VECTOR8_TAIL);
8930   cmpq(length, 8);
8931   jccb(Assembler::less, VECTOR4_TAIL);
8932   bind(VECTOR8_LOOP);
8933   movq(tmp1, Address(obja, result));
8934   movq(tmp2, Address(objb, result));
8935   xorq(tmp1, tmp2);
8936   testq(tmp1, tmp1);
8937   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8938   addq(result, 8);
8939   subq(length, 8);
8940   jcc(Assembler::equal, SAME_TILL_END);
8941   //falling through if less than 8 bytes left
8942 
8943   bind(VECTOR4_TAIL);
8944   cmpq(length, 4);
8945   jccb(Assembler::less, BYTES_TAIL);
8946   bind(VECTOR4_LOOP);
8947   movl(tmp1, Address(obja, result));
8948   xorl(tmp1, Address(objb, result));
8949   testl(tmp1, tmp1);
8950   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8951   addq(result, 4);
8952   subq(length, 4);
8953   jcc(Assembler::equal, SAME_TILL_END);
8954   //falling through if less than 4 bytes left
8955 
8956   bind(BYTES_TAIL);
8957   bind(BYTES_LOOP);
8958   load_unsigned_byte(tmp1, Address(obja, result));
8959   load_unsigned_byte(tmp2, Address(objb, result));
8960   xorl(tmp1, tmp2);
8961   testl(tmp1, tmp1);
8962   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8963   decq(length);
8964   jccb(Assembler::zero, SAME_TILL_END);
8965   incq(result);
8966   load_unsigned_byte(tmp1, Address(obja, result));
8967   load_unsigned_byte(tmp2, Address(objb, result));
8968   xorl(tmp1, tmp2);
8969   testl(tmp1, tmp1);
8970   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8971   decq(length);
8972   jccb(Assembler::zero, SAME_TILL_END);
8973   incq(result);
8974   load_unsigned_byte(tmp1, Address(obja, result));
8975   load_unsigned_byte(tmp2, Address(objb, result));
8976   xorl(tmp1, tmp2);
8977   testl(tmp1, tmp1);
8978   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8979   jmpb(SAME_TILL_END);
8980 
8981   if (UseAVX >= 2) {
8982     bind(VECTOR32_NOT_EQUAL);
8983     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
8984     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
8985     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
8986     vpmovmskb(tmp1, rymm0);
8987     bsfq(tmp1, tmp1);
8988     addq(result, tmp1);
8989     shrq(result);
8990     jmpb(DONE);
8991   }
8992 
8993   bind(VECTOR16_NOT_EQUAL);
8994   if (UseAVX >= 2) {
8995     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
8996     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
8997     pxor(rymm0, rymm2);
8998   } else {
8999     pcmpeqb(rymm2, rymm2);
9000     pxor(rymm0, rymm1);
9001     pcmpeqb(rymm0, rymm1);
9002     pxor(rymm0, rymm2);
9003   }
9004   pmovmskb(tmp1, rymm0);
9005   bsfq(tmp1, tmp1);
9006   addq(result, tmp1);
9007   shrq(result);
9008   jmpb(DONE);
9009 
9010   bind(VECTOR8_NOT_EQUAL);
9011   bind(VECTOR4_NOT_EQUAL);
9012   bsfq(tmp1, tmp1);
9013   shrq(tmp1, 3);
9014   addq(result, tmp1);
9015   bind(BYTES_NOT_EQUAL);
9016   shrq(result);
9017   jmpb(DONE);
9018 
9019   bind(SAME_TILL_END);
9020   mov64(result, -1);
9021 
9022   bind(DONE);
9023 }
9024 
9025 //Helper functions for square_to_len()
9026 
9027 /**
9028  * Store the squares of x[], right shifted by one bit (i.e., divided by 2), into z[].
9029  * Preserves x and z and modifies the rest of the registers.
9030  */
9031 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9032   // Perform square and right shift by 1
9033   // Handle odd xlen case first, then for even xlen do the following
9034   // jlong carry = 0;
9035   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
9036   //     huge_128 product = x[j:j+1] * x[j:j+1];
9037   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
9038   //     z[i+2:i+3] = (jlong)(product >>> 1);
9039   //     carry = (jlong)product;
9040   // }
9041 
9042   xorq(tmp5, tmp5);     // carry
9043   xorq(rdxReg, rdxReg);
9044   xorl(tmp1, tmp1);     // index for x
9045   xorl(tmp4, tmp4);     // index for z
9046 
9047   Label L_first_loop, L_first_loop_exit;
9048 
9049   testl(xlen, 1);
9050   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
9051 
9052   // Square and right shift by 1 the odd element using 32 bit multiply
9053   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
9054   imulq(raxReg, raxReg);
9055   shrq(raxReg, 1);
9056   adcq(tmp5, 0);
9057   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
9058   incrementl(tmp1);
9059   addl(tmp4, 2);
9060 
9061   // Square and  right shift by 1 the rest using 64 bit multiply
9062   bind(L_first_loop);
9063   cmpptr(tmp1, xlen);
9064   jccb(Assembler::equal, L_first_loop_exit);
9065 
9066   // Square
9067   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
9068   rorq(raxReg, 32);    // convert big-endian to little-endian
9069   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
9070 
9071   // Right shift by 1 and save carry
9072   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
9073   rcrq(rdxReg, 1);
9074   rcrq(raxReg, 1);
9075   adcq(tmp5, 0);
9076 
9077   // Store result in z
9078   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
9079   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
9080 
9081   // Update indices for x and z
9082   addl(tmp1, 2);
9083   addl(tmp4, 4);
9084   jmp(L_first_loop);
9085 
9086   bind(L_first_loop_exit);
9087 }
9088 
9089 
9090 /**
9091  * Perform the following multiply add operation using BMI2 instructions
9092  * carry:sum = sum + op1*op2 + carry
9093  * op2 should be in rdx
9094  * op2 is preserved, all other registers are modified
9095  */
9096 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
9097   // assert op2 is rdx
9098   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
9099   addq(sum, carry);
9100   adcq(tmp2, 0);
9101   addq(sum, op1);
9102   adcq(tmp2, 0);
9103   movq(carry, tmp2);
9104 }
9105 
9106 /**
9107  * Perform the following multiply add operation:
9108  * carry:sum = sum + op1*op2 + carry
9109  * Preserves op1, op2 and modifies rest of registers
9110  */
9111 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
9112   // rdx:rax = op1 * op2
9113   movq(raxReg, op2);
9114   mulq(op1);
9115 
9116   //  rdx:rax = sum + carry + rdx:rax
9117   addq(sum, carry);
9118   adcq(rdxReg, 0);
9119   addq(sum, raxReg);
9120   adcq(rdxReg, 0);
9121 
9122   // carry:sum = rdx:sum
9123   movq(carry, rdxReg);
9124 }
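// For reference, a scalar sketch of the multiply-add performed by
// multiply_add_64_bmi2() and multiply_add_64() above (illustrative only; relies
// on the compiler-specific unsigned __int128 extension):
//
//   void multiply_add_64_ref(uint64_t& sum, uint64_t op1, uint64_t op2,
//                            uint64_t& carry) {
//     unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
//     sum   = (uint64_t)t;          // low 64 bits
//     carry = (uint64_t)(t >> 64);  // high 64 bits become the new carry
//   }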
9125 
9126 /**
9127  * Add 64-bit long carry into z[] with carry propagation.
9128  * Preserves z and carry register values and modifies rest of registers.
9129  *
9130  */
9131 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
9132   Label L_fourth_loop, L_fourth_loop_exit;
9133 
9134   movl(tmp1, 1);
9135   subl(zlen, 2);
9136   addq(Address(z, zlen, Address::times_4, 0), carry);
9137 
9138   bind(L_fourth_loop);
9139   jccb(Assembler::carryClear, L_fourth_loop_exit);
9140   subl(zlen, 2);
9141   jccb(Assembler::negative, L_fourth_loop_exit);
9142   addq(Address(z, zlen, Address::times_4, 0), tmp1);
9143   jmp(L_fourth_loop);
9144   bind(L_fourth_loop_exit);
9145 }
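// For reference, a scalar sketch of the carry propagation performed by
// add_one_64() above (illustrative only; z[i] denotes the 64-bit limb that
// starts at 32-bit index i, most significant limb first, and zlen counts
// 32-bit ints):
//
//   int i = zlen - 2;
//   uint64_t s = z[i] + carry;
//   bool c = (s < carry);            // carry out of the first addition
//   z[i] = s;
//   for (i -= 2; c && i >= 0; i -= 2) {
//     c = (++z[i] == 0);             // propagate a single carry bit upward
//   }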
9146 
9147 /**
9148  * Shift z[] left by 1 bit.
9149  * Preserves x, len, z and zlen registers and modifies rest of the registers.
9150  *
9151  */
9152 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
9153 
9154   Label L_fifth_loop, L_fifth_loop_exit;
9155 
9156   // Fifth loop
9157   // Perform primitiveLeftShift(z, zlen, 1)
9158 
9159   const Register prev_carry = tmp1;
9160   const Register new_carry = tmp4;
9161   const Register value = tmp2;
9162   const Register zidx = tmp3;
9163 
9164   // int zidx, carry;
9165   // long value;
9166   // carry = 0;
9167   // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
9168   //    (carry:value) = (z[zidx] << 1) | carry;
9169   //    z[zidx] = value;
9170   // }
9171 
9172   movl(zidx, zlen);
9173   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
9174 
9175   bind(L_fifth_loop);
9176   decl(zidx);  // Use decl to preserve carry flag
9177   decl(zidx);
9178   jccb(Assembler::negative, L_fifth_loop_exit);
9179 
9180   if (UseBMI2Instructions) {
9181      movq(value, Address(z, zidx, Address::times_4, 0));
9182      rclq(value, 1);
9183      rorxq(value, value, 32);
9184      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9185   }
9186   else {
9187     // clear new_carry
9188     xorl(new_carry, new_carry);
9189 
9190     // Shift z[i] by 1, or in previous carry and save new carry
9191     movq(value, Address(z, zidx, Address::times_4, 0));
9192     shlq(value, 1);
9193     adcl(new_carry, 0);
9194 
9195     orq(value, prev_carry);
9196     rorq(value, 0x20);
9197     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9198 
9199     // Set previous carry = new carry
9200     movl(prev_carry, new_carry);
9201   }
9202   jmp(L_fifth_loop);
9203 
9204   bind(L_fifth_loop_exit);
9205 }
9206 
9207 
9208 /**
9209  * Code for BigInteger::squareToLen() intrinsic
9210  *
9211  * rdi: x
9212  * rsi: len
9213  * r8:  z
9214  * rcx: zlen
9215  * r12: tmp1
9216  * r13: tmp2
9217  * r14: tmp3
9218  * r15: tmp4
9219  * rbx: tmp5
9220  *
9221  */
9222 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9223 
9224   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
9225   push(tmp1);
9226   push(tmp2);
9227   push(tmp3);
9228   push(tmp4);
9229   push(tmp5);
9230 
9231   // First loop
9232   // Store the squares, right shifted one bit (i.e., divided by 2).
9233   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
9234 
9235   // Add in off-diagonal sums.
9236   //
9237   // Second, third (nested) and fourth loops.
9238   // zlen +=2;
9239   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
9240   //    carry = 0;
9241   //    long op2 = x[xidx:xidx+1];
9242   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
9243   //       k -= 2;
9244   //       long op1 = x[j:j+1];
9245   //       long sum = z[k:k+1];
9246   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
9247   //       z[k:k+1] = sum;
9248   //    }
9249   //    add_one_64(z, k, carry, tmp_regs);
9250   // }
9251 
9252   const Register carry = tmp5;
9253   const Register sum = tmp3;
9254   const Register op1 = tmp4;
9255   Register op2 = tmp2;
9256 
9257   push(zlen);
9258   push(len);
9259   addl(zlen,2);
9260   bind(L_second_loop);
9261   xorq(carry, carry);
9262   subl(zlen, 4);
9263   subl(len, 2);
9264   push(zlen);
9265   push(len);
9266   cmpl(len, 0);
9267   jccb(Assembler::lessEqual, L_second_loop_exit);
9268 
9269   // Multiply an array by one 64 bit long.
9270   if (UseBMI2Instructions) {
9271     op2 = rdxReg;
9272     movq(op2, Address(x, len, Address::times_4,  0));
9273     rorxq(op2, op2, 32);
9274   }
9275   else {
9276     movq(op2, Address(x, len, Address::times_4,  0));
9277     rorq(op2, 32);
9278   }
9279 
9280   bind(L_third_loop);
9281   decrementl(len);
9282   jccb(Assembler::negative, L_third_loop_exit);
9283   decrementl(len);
9284   jccb(Assembler::negative, L_last_x);
9285 
9286   movq(op1, Address(x, len, Address::times_4,  0));
9287   rorq(op1, 32);
9288 
9289   bind(L_multiply);
9290   subl(zlen, 2);
9291   movq(sum, Address(z, zlen, Address::times_4,  0));
9292 
9293   // Multiply 64 bit by 64 bit, add the lower 64 bits of the product into sum and keep the upper 64 bits as the carry.
9294   if (UseBMI2Instructions) {
9295     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
9296   }
9297   else {
9298     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9299   }
9300 
9301   movq(Address(z, zlen, Address::times_4, 0), sum);
9302 
9303   jmp(L_third_loop);
9304   bind(L_third_loop_exit);
9305 
9306   // Fourth loop
9307   // Add 64-bit long carry into z with carry propagation.
9308   // Uses the adjusted zlen.
9309   add_one_64(z, zlen, carry, tmp1);
9310 
9311   pop(len);
9312   pop(zlen);
9313   jmp(L_second_loop);
9314 
9315   // Next infrequent code is moved outside loops.
9316   bind(L_last_x);
9317   movl(op1, Address(x, 0));
9318   jmp(L_multiply);
9319 
9320   bind(L_second_loop_exit);
9321   pop(len);
9322   pop(zlen);
9323   pop(len);
9324   pop(zlen);
9325 
9326   // Fifth loop
9327   // Shift z left 1 bit.
9328   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
9329 
9330   // z[zlen-1] |= x[len-1] & 1;
9331   movl(tmp3, Address(x, len, Address::times_4, -4));
9332   andl(tmp3, 1);
9333   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
9334 
9335   pop(tmp5);
9336   pop(tmp4);
9337   pop(tmp3);
9338   pop(tmp2);
9339   pop(tmp1);
9340 }
9341 
9342 /**
9343  * Helper function for mul_add()
9344  * Multiply in[] by int k and add to out[] starting at offset offs, using
9345  * 128-bit by 32-bit multiplies, and return the carry in tmp5.
9346  * Only the quad-int-aligned portion of in[] is processed by this function.
9347  * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
9348  * This function preserves the out, in and k registers.
9349  * len and offset point to the appropriate index in "in" and "out" respectively.
9350  * tmp5 holds the carry.
9351  * The other registers are temporaries and are modified.
9352  *
9353  */
9354 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
9355   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
9356   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9357 
9358   Label L_first_loop, L_first_loop_exit;
9359 
9360   movl(tmp1, len);
9361   shrl(tmp1, 2);
9362 
9363   bind(L_first_loop);
9364   subl(tmp1, 1);
9365   jccb(Assembler::negative, L_first_loop_exit);
9366 
9367   subl(len, 4);
9368   subl(offset, 4);
9369 
9370   Register op2 = tmp2;
9371   const Register sum = tmp3;
9372   const Register op1 = tmp4;
9373   const Register carry = tmp5;
9374 
9375   if (UseBMI2Instructions) {
9376     op2 = rdxReg;
9377   }
9378 
9379   movq(op1, Address(in, len, Address::times_4,  8));
9380   rorq(op1, 32);
9381   movq(sum, Address(out, offset, Address::times_4,  8));
9382   rorq(sum, 32);
9383   if (UseBMI2Instructions) {
9384     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9385   }
9386   else {
9387     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9388   }
9389   // Store back in big endian from little endian
9390   rorq(sum, 0x20);
9391   movq(Address(out, offset, Address::times_4,  8), sum);
9392 
9393   movq(op1, Address(in, len, Address::times_4,  0));
9394   rorq(op1, 32);
9395   movq(sum, Address(out, offset, Address::times_4,  0));
9396   rorq(sum, 32);
9397   if (UseBMI2Instructions) {
9398     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9399   }
9400   else {
9401     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9402   }
9403   // Store back in big endian from little endian
9404   rorq(sum, 0x20);
9405   movq(Address(out, offset, Address::times_4,  0), sum);
9406 
9407   jmp(L_first_loop);
9408   bind(L_first_loop_exit);
9409 }
9410 
9411 /**
9412  * Code for BigInteger::mulAdd() intrinsic
9413  *
9414  * rdi: out
9415  * rsi: in
9416  * r11: offs (out.length - offset)
9417  * rcx: len
9418  * r8:  k
9419  * r12: tmp1
9420  * r13: tmp2
9421  * r14: tmp3
9422  * r15: tmp4
9423  * rbx: tmp5
9424  * Multiply the in[] by word k and add to out[], return the carry in rax
9425  */
9426 void MacroAssembler::mul_add(Register out, Register in, Register offs,
9427    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
9428    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9429 
9430   Label L_carry, L_last_in, L_done;
9431 
9432 // carry = 0;
9433 // for (int j=len-1; j >= 0; j--) {
9434 //    long product = (in[j] & LONG_MASK) * kLong +
9435 //                   (out[offs] & LONG_MASK) + carry;
9436 //    out[offs--] = (int)product;
9437 //    carry = product >>> 32;
9438 // }
9439 //
9440   push(tmp1);
9441   push(tmp2);
9442   push(tmp3);
9443   push(tmp4);
9444   push(tmp5);
9445 
9446   Register op2 = tmp2;
9447   const Register sum = tmp3;
9448   const Register op1 = tmp4;
9449   const Register carry =  tmp5;
9450 
9451   if (UseBMI2Instructions) {
9452     op2 = rdxReg;
9453     movl(op2, k);
9454   }
9455   else {
9456     movl(op2, k);
9457   }
9458 
9459   xorq(carry, carry);
9460 
9461   //First loop
9462 
9463   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
9464   //The carry is in tmp5
9465   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
9466 
9467   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
9468   decrementl(len);
9469   jccb(Assembler::negative, L_carry);
9470   decrementl(len);
9471   jccb(Assembler::negative, L_last_in);
9472 
9473   movq(op1, Address(in, len, Address::times_4,  0));
9474   rorq(op1, 32);
9475 
9476   subl(offs, 2);
9477   movq(sum, Address(out, offs, Address::times_4,  0));
9478   rorq(sum, 32);
9479 
9480   if (UseBMI2Instructions) {
9481     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9482   }
9483   else {
9484     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9485   }
9486 
9487   // Store back in big endian from little endian
9488   rorq(sum, 0x20);
9489   movq(Address(out, offs, Address::times_4,  0), sum);
9490 
9491   testl(len, len);
9492   jccb(Assembler::zero, L_carry);
9493 
9494   //Multiply the last in[] entry, if any
9495   bind(L_last_in);
9496   movl(op1, Address(in, 0));
9497   movl(sum, Address(out, offs, Address::times_4,  -4));
9498 
9499   movl(raxReg, k);
9500   mull(op1); //tmp4 * eax -> edx:eax
9501   addl(sum, carry);
9502   adcl(rdxReg, 0);
9503   addl(sum, raxReg);
9504   adcl(rdxReg, 0);
9505   movl(carry, rdxReg);
9506 
9507   movl(Address(out, offs, Address::times_4,  -4), sum);
9508 
9509   bind(L_carry);
9510   //return tmp5/carry as carry in rax
9511   movl(rax, carry);
9512 
9513   bind(L_done);
9514   pop(tmp5);
9515   pop(tmp4);
9516   pop(tmp3);
9517   pop(tmp2);
9518   pop(tmp1);
9519 }
9520 #endif
9521 
9522 /**
9523  * Emits code to update CRC-32 with a byte value according to constants in table
9524  *
9525  * @param [in,out]crc   Register containing the crc.
9526  * @param [in]val       Register containing the byte to fold into the CRC.
9527  * @param [in]table     Register containing the table of crc constants.
9528  *
9529  * uint32_t crc;
9530  * val = crc_table[(val ^ crc) & 0xFF];
9531  * crc = val ^ (crc >> 8);
9532  *
9533  */
9534 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
9535   xorl(val, crc);
9536   andl(val, 0xFF);
9537   shrl(crc, 8); // unsigned shift
9538   xorl(crc, Address(table, val, Address::times_4, 0));
9539 }
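// For reference, the byte-at-a-time update above corresponds to the following
// scalar loop over a buffer (illustrative sketch; crc_table is the same
// 256-entry table that the 'table' register points to):
//
//   uint32_t crc32_update_bytes(uint32_t crc, const uint8_t* buf, size_t len,
//                               const uint32_t crc_table[256]) {
//     for (size_t i = 0; i < len; i++) {
//       crc = crc_table[(buf[i] ^ crc) & 0xFF] ^ (crc >> 8);
//     }
//     return crc;
//   }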
9540 
9541 /**
9542 * Fold four 128-bit data chunks
9543 */
9544 void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
9545   evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64]
9546   evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
9547   evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
9548   evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
9549 }
9550 
9551 /**
9552  * Fold 128-bit data chunk
9553  */
9554 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
9555   if (UseAVX > 0) {
9556     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
9557     vpclmulldq(xcrc, xK, xcrc); // [63:0]
9558     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
9559     pxor(xcrc, xtmp);
9560   } else {
9561     movdqa(xtmp, xcrc);
9562     pclmulhdq(xtmp, xK);   // [123:64]
9563     pclmulldq(xcrc, xK);   // [63:0]
9564     pxor(xcrc, xtmp);
9565     movdqu(xtmp, Address(buf, offset));
9566     pxor(xcrc, xtmp);
9567   }
9568 }
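// In outline (descriptive note): the folding step above splits the 128-bit CRC
// state xcrc into two 64-bit halves, carry-less multiplies each half by the
// matching half of the folding constant xK, and XORs the two products together
// with the next 128 bits of input to form the new state.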
9569 
9570 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
9571   if (UseAVX > 0) {
9572     vpclmulhdq(xtmp, xK, xcrc);
9573     vpclmulldq(xcrc, xK, xcrc);
9574     pxor(xcrc, xbuf);
9575     pxor(xcrc, xtmp);
9576   } else {
9577     movdqa(xtmp, xcrc);
9578     pclmulhdq(xtmp, xK);
9579     pclmulldq(xcrc, xK);
9580     pxor(xcrc, xbuf);
9581     pxor(xcrc, xtmp);
9582   }
9583 }
9584 
9585 /**
9586  * 8-bit folds to compute 32-bit CRC
9587  *
9588  * uint64_t xcrc;
9589  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
9590  */
9591 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
9592   movdl(tmp, xcrc);
9593   andl(tmp, 0xFF);
9594   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
9595   psrldq(xcrc, 1); // unsigned shift one byte
9596   pxor(xcrc, xtmp);
9597 }
9598 
9599 /**
9600  * uint32_t crc;
9601  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
9602  */
9603 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
9604   movl(tmp, crc);
9605   andl(tmp, 0xFF);
9606   shrl(crc, 8);
9607   xorl(crc, Address(table, tmp, Address::times_4, 0));
9608 }
9609 
9610 /**
9611  * @param crc   register containing existing CRC (32-bit)
9612  * @param buf   register pointing to input byte buffer (byte*)
9613  * @param len   register containing number of bytes
9614  * @param table register that will contain address of CRC table
9615  * @param tmp   scratch register
9616  */
9617 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
9618   assert_different_registers(crc, buf, len, table, tmp, rax);
9619 
9620   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
9621   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
9622 
9623   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
9624   // context for the registers used, where all instructions below are using 128-bit mode
9625   // On EVEX without VL and BW, these instructions will all be AVX.
9626   if (VM_Version::supports_avx512vlbw()) {
9627     movl(tmp, 0xffff);
9628     kmovwl(k1, tmp);
9629   }
9630 
9631   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
9632   notl(crc); // ~crc
9633   cmpl(len, 16);
9634   jcc(Assembler::less, L_tail);
9635 
9636   // Align buffer to 16 bytes
9637   movl(tmp, buf);
9638   andl(tmp, 0xF);
9639   jccb(Assembler::zero, L_aligned);
9640   subl(tmp,  16);
9641   addl(len, tmp);
9642 
9643   align(4);
9644   BIND(L_align_loop);
9645   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9646   update_byte_crc32(crc, rax, table);
9647   increment(buf);
9648   incrementl(tmp);
9649   jccb(Assembler::less, L_align_loop);
9650 
9651   BIND(L_aligned);
9652   movl(tmp, len); // save
9653   shrl(len, 4);
9654   jcc(Assembler::zero, L_tail_restore);
9655 
9656   // Fold total 512 bits of polynomial on each iteration
9657   if (VM_Version::supports_vpclmulqdq()) {
9658     Label Parallel_loop, L_No_Parallel;
9659 
9660     cmpl(len, 8);
9661     jccb(Assembler::less, L_No_Parallel);
9662 
9663     movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9664     evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
9665     movdl(xmm5, crc);
9666     evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
9667     addptr(buf, 64);
9668     subl(len, 7);
9669     evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits
9670 
9671     BIND(Parallel_loop);
9672     fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
9673     addptr(buf, 64);
9674     subl(len, 4);
9675     jcc(Assembler::greater, Parallel_loop);
9676 
9677     vextracti64x2(xmm2, xmm1, 0x01);
9678     vextracti64x2(xmm3, xmm1, 0x02);
9679     vextracti64x2(xmm4, xmm1, 0x03);
9680     jmp(L_fold_512b);
9681 
9682     BIND(L_No_Parallel);
9683   }
9684   // Fold crc into first bytes of vector
9685   movdqa(xmm1, Address(buf, 0));
9686   movdl(rax, xmm1);
9687   xorl(crc, rax);
9688   if (VM_Version::supports_sse4_1()) {
9689     pinsrd(xmm1, crc, 0);
9690   } else {
9691     pinsrw(xmm1, crc, 0);
9692     shrl(crc, 16);
9693     pinsrw(xmm1, crc, 1);
9694   }
9695   addptr(buf, 16);
9696   subl(len, 4); // len > 0
9697   jcc(Assembler::less, L_fold_tail);
9698 
9699   movdqa(xmm2, Address(buf,  0));
9700   movdqa(xmm3, Address(buf, 16));
9701   movdqa(xmm4, Address(buf, 32));
9702   addptr(buf, 48);
9703   subl(len, 3);
9704   jcc(Assembler::lessEqual, L_fold_512b);
9705 
9706   // Fold total 512 bits of polynomial on each iteration,
9707   // 128 bits per each of 4 parallel streams.
9708   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9709 
9710   align(32);
9711   BIND(L_fold_512b_loop);
9712   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9713   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
9714   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
9715   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
9716   addptr(buf, 64);
9717   subl(len, 4);
9718   jcc(Assembler::greater, L_fold_512b_loop);
9719 
9720   // Fold 512 bits to 128 bits.
9721   BIND(L_fold_512b);
9722   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9723   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
9724   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
9725   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
9726 
9727   // Fold the rest of 128 bits data chunks
9728   BIND(L_fold_tail);
9729   addl(len, 3);
9730   jccb(Assembler::lessEqual, L_fold_128b);
9731   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9732 
9733   BIND(L_fold_tail_loop);
9734   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9735   addptr(buf, 16);
9736   decrementl(len);
9737   jccb(Assembler::greater, L_fold_tail_loop);
9738 
9739   // Fold 128 bits in xmm1 down into 32 bits in crc register.
9740   BIND(L_fold_128b);
9741   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
9742   if (UseAVX > 0) {
9743     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
9744     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
9745     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
9746   } else {
9747     movdqa(xmm2, xmm0);
9748     pclmulqdq(xmm2, xmm1, 0x1);
9749     movdqa(xmm3, xmm0);
9750     pand(xmm3, xmm2);
9751     pclmulqdq(xmm0, xmm3, 0x1);
9752   }
9753   psrldq(xmm1, 8);
9754   psrldq(xmm2, 4);
9755   pxor(xmm0, xmm1);
9756   pxor(xmm0, xmm2);
9757 
9758   // 8 8-bit folds to compute 32-bit CRC.
9759   for (int j = 0; j < 4; j++) {
9760     fold_8bit_crc32(xmm0, table, xmm1, rax);
9761   }
9762   movdl(crc, xmm0); // mov 32 bits to general register
9763   for (int j = 0; j < 4; j++) {
9764     fold_8bit_crc32(crc, table, rax);
9765   }
9766 
9767   BIND(L_tail_restore);
9768   movl(len, tmp); // restore
9769   BIND(L_tail);
9770   andl(len, 0xf);
9771   jccb(Assembler::zero, L_exit);
9772 
9773   // Fold the rest of bytes
9774   align(4);
9775   BIND(L_tail_loop);
9776   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9777   update_byte_crc32(crc, rax, table);
9778   increment(buf);
9779   decrementl(len);
9780   jccb(Assembler::greater, L_tail_loop);
9781 
9782   BIND(L_exit);
9783   notl(crc); // ~crc
9784 }
9785 
9786 #ifdef _LP64
9787 // S. Gueron / Information Processing Letters 112 (2012) 184
9788 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
9789 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
9790 // Output: the 64-bit carry-less product of B * CONST
9791 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
9792                                      Register tmp1, Register tmp2, Register tmp3) {
9793   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9794   if (n > 0) {
9795     addq(tmp3, n * 256 * 8);
9796   }
9797   //    Q1 = TABLEExt[n][B & 0xFF];
9798   movl(tmp1, in);
9799   andl(tmp1, 0x000000FF);
9800   shll(tmp1, 3);
9801   addq(tmp1, tmp3);
9802   movq(tmp1, Address(tmp1, 0));
9803 
9804   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9805   movl(tmp2, in);
9806   shrl(tmp2, 8);
9807   andl(tmp2, 0x000000FF);
9808   shll(tmp2, 3);
9809   addq(tmp2, tmp3);
9810   movq(tmp2, Address(tmp2, 0));
9811 
9812   shlq(tmp2, 8);
9813   xorq(tmp1, tmp2);
9814 
9815   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9816   movl(tmp2, in);
9817   shrl(tmp2, 16);
9818   andl(tmp2, 0x000000FF);
9819   shll(tmp2, 3);
9820   addq(tmp2, tmp3);
9821   movq(tmp2, Address(tmp2, 0));
9822 
9823   shlq(tmp2, 16);
9824   xorq(tmp1, tmp2);
9825 
9826   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9827   shrl(in, 24);
9828   andl(in, 0x000000FF);
9829   shll(in, 3);
9830   addq(in, tmp3);
9831   movq(in, Address(in, 0));
9832 
9833   shlq(in, 24);
9834   xorq(in, tmp1);
9835   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9836 }
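// For reference, a scalar sketch of the table-based carry-less multiplication
// above (illustrative only; 'table' stands for the 256-entry slice of the
// precomputed CRC32C table selected by n):
//
//   uint64_t clmul_by_const(uint32_t B, const uint64_t table[256]) {
//     uint64_t q1 = table[B & 0xFF];
//     uint64_t q2 = table[(B >> 8) & 0xFF];
//     uint64_t q3 = table[(B >> 16) & 0xFF];
//     uint64_t q4 = table[(B >> 24) & 0xFF];
//     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
//   }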
9837 
9838 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9839                                       Register in_out,
9840                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9841                                       XMMRegister w_xtmp2,
9842                                       Register tmp1,
9843                                       Register n_tmp2, Register n_tmp3) {
9844   if (is_pclmulqdq_supported) {
9845     movdl(w_xtmp1, in_out); // modified blindly
9846 
9847     movl(tmp1, const_or_pre_comp_const_index);
9848     movdl(w_xtmp2, tmp1);
9849     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9850 
9851     movdq(in_out, w_xtmp1);
9852   } else {
9853     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
9854   }
9855 }
9856 
9857 // Recombination Alternative 2: No bit-reflections
9858 // T1 = (CRC_A * U1) << 1
9859 // T2 = (CRC_B * U2) << 1
9860 // C1 = T1 >> 32
9861 // C2 = T2 >> 32
9862 // T1 = T1 & 0xFFFFFFFF
9863 // T2 = T2 & 0xFFFFFFFF
9864 // T1 = CRC32(0, T1)
9865 // T2 = CRC32(0, T2)
9866 // C1 = C1 ^ T1
9867 // C2 = C2 ^ T2
9868 // CRC = C1 ^ C2 ^ CRC_C
9869 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9870                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9871                                      Register tmp1, Register tmp2,
9872                                      Register n_tmp3) {
9873   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9874   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9875   shlq(in_out, 1);
9876   movl(tmp1, in_out);
9877   shrq(in_out, 32);
9878   xorl(tmp2, tmp2);
9879   crc32(tmp2, tmp1, 4);
9880   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
9881   shlq(in1, 1);
9882   movl(tmp1, in1);
9883   shrq(in1, 32);
9884   xorl(tmp2, tmp2);
9885   crc32(tmp2, tmp1, 4);
9886   xorl(in1, tmp2);
9887   xorl(in_out, in1);
9888   xorl(in_out, in2);
9889 }
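// For reference, a scalar sketch of the recombination above (illustrative only;
// t1 and t2 stand for the 64-bit carry-less products CRC_A*U1 and CRC_B*U2
// produced by crc32c_pclmulqdq(), and _mm_crc32_u32 is the SSE4.2 intrinsic for
// the CRC32 instruction):
//
//   uint32_t crc32c_recombine(uint64_t t1, uint64_t t2, uint32_t crc_c) {
//     t1 <<= 1;
//     t2 <<= 1;
//     uint32_t c1 = (uint32_t)(t1 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t1);
//     uint32_t c2 = (uint32_t)(t2 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t2);
//     return c1 ^ c2 ^ crc_c;
//   }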
9890 
9891 // Set N to predefined value
9892 // Subtract from the length of the buffer
9893 // execute in a loop:
9894 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9895 // for i = 1 to N do
9896 //  CRC_A = CRC32(CRC_A, A[i])
9897 //  CRC_B = CRC32(CRC_B, B[i])
9898 //  CRC_C = CRC32(CRC_C, C[i])
9899 // end for
9900 // Recombine
9901 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9902                                        Register in_out1, Register in_out2, Register in_out3,
9903                                        Register tmp1, Register tmp2, Register tmp3,
9904                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9905                                        Register tmp4, Register tmp5,
9906                                        Register n_tmp6) {
9907   Label L_processPartitions;
9908   Label L_processPartition;
9909   Label L_exit;
9910 
9911   bind(L_processPartitions);
9912   cmpl(in_out1, 3 * size);
9913   jcc(Assembler::less, L_exit);
9914     xorl(tmp1, tmp1);
9915     xorl(tmp2, tmp2);
9916     movq(tmp3, in_out2);
9917     addq(tmp3, size);
9918 
9919     bind(L_processPartition);
9920       crc32(in_out3, Address(in_out2, 0), 8);
9921       crc32(tmp1, Address(in_out2, size), 8);
9922       crc32(tmp2, Address(in_out2, size * 2), 8);
9923       addq(in_out2, 8);
9924       cmpq(in_out2, tmp3);
9925       jcc(Assembler::less, L_processPartition);
9926     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9927             w_xtmp1, w_xtmp2, w_xtmp3,
9928             tmp4, tmp5,
9929             n_tmp6);
9930     addq(in_out2, 2 * size);
9931     subl(in_out1, 3 * size);
9932     jmp(L_processPartitions);
9933 
9934   bind(L_exit);
9935 }
9936 #else
9937 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9938                                      Register tmp1, Register tmp2, Register tmp3,
9939                                      XMMRegister xtmp1, XMMRegister xtmp2) {
9940   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9941   if (n > 0) {
9942     addl(tmp3, n * 256 * 8);
9943   }
9944   //    Q1 = TABLEExt[n][B & 0xFF];
9945   movl(tmp1, in_out);
9946   andl(tmp1, 0x000000FF);
9947   shll(tmp1, 3);
9948   addl(tmp1, tmp3);
9949   movq(xtmp1, Address(tmp1, 0));
9950 
9951   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9952   movl(tmp2, in_out);
9953   shrl(tmp2, 8);
9954   andl(tmp2, 0x000000FF);
9955   shll(tmp2, 3);
9956   addl(tmp2, tmp3);
9957   movq(xtmp2, Address(tmp2, 0));
9958 
9959   psllq(xtmp2, 8);
9960   pxor(xtmp1, xtmp2);
9961 
9962   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9963   movl(tmp2, in_out);
9964   shrl(tmp2, 16);
9965   andl(tmp2, 0x000000FF);
9966   shll(tmp2, 3);
9967   addl(tmp2, tmp3);
9968   movq(xtmp2, Address(tmp2, 0));
9969 
9970   psllq(xtmp2, 16);
9971   pxor(xtmp1, xtmp2);
9972 
9973   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9974   shrl(in_out, 24);
9975   andl(in_out, 0x000000FF);
9976   shll(in_out, 3);
9977   addl(in_out, tmp3);
9978   movq(xtmp2, Address(in_out, 0));
9979 
9980   psllq(xtmp2, 24);
9981   pxor(xtmp1, xtmp2); // Result in CXMM
9982   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9983 }
9984 
9985 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9986                                       Register in_out,
9987                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9988                                       XMMRegister w_xtmp2,
9989                                       Register tmp1,
9990                                       Register n_tmp2, Register n_tmp3) {
9991   if (is_pclmulqdq_supported) {
9992     movdl(w_xtmp1, in_out);
9993 
9994     movl(tmp1, const_or_pre_comp_const_index);
9995     movdl(w_xtmp2, tmp1);
9996     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9997     // Keep result in XMM since GPR is 32 bit in length
9998   } else {
9999     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
10000   }
10001 }
10002 
10003 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10004                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10005                                      Register tmp1, Register tmp2,
10006                                      Register n_tmp3) {
10007   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10008   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10009 
10010   psllq(w_xtmp1, 1);
10011   movdl(tmp1, w_xtmp1);
10012   psrlq(w_xtmp1, 32);
10013   movdl(in_out, w_xtmp1);
10014 
10015   xorl(tmp2, tmp2);
10016   crc32(tmp2, tmp1, 4);
10017   xorl(in_out, tmp2);
10018 
10019   psllq(w_xtmp2, 1);
10020   movdl(tmp1, w_xtmp2);
10021   psrlq(w_xtmp2, 32);
10022   movdl(in1, w_xtmp2);
10023 
10024   xorl(tmp2, tmp2);
10025   crc32(tmp2, tmp1, 4);
10026   xorl(in1, tmp2);
10027   xorl(in_out, in1);
10028   xorl(in_out, in2);
10029 }
10030 
10031 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10032                                        Register in_out1, Register in_out2, Register in_out3,
10033                                        Register tmp1, Register tmp2, Register tmp3,
10034                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10035                                        Register tmp4, Register tmp5,
10036                                        Register n_tmp6) {
10037   Label L_processPartitions;
10038   Label L_processPartition;
10039   Label L_exit;
10040 
10041   bind(L_processPartitions);
10042   cmpl(in_out1, 3 * size);
10043   jcc(Assembler::less, L_exit);
10044     xorl(tmp1, tmp1);
10045     xorl(tmp2, tmp2);
10046     movl(tmp3, in_out2);
10047     addl(tmp3, size);
10048 
10049     bind(L_processPartition);
10050       crc32(in_out3, Address(in_out2, 0), 4);
10051       crc32(tmp1, Address(in_out2, size), 4);
10052       crc32(tmp2, Address(in_out2, size*2), 4);
10053       crc32(in_out3, Address(in_out2, 0+4), 4);
10054       crc32(tmp1, Address(in_out2, size+4), 4);
10055       crc32(tmp2, Address(in_out2, size*2+4), 4);
10056       addl(in_out2, 8);
10057       cmpl(in_out2, tmp3);
10058       jcc(Assembler::less, L_processPartition);
10059 
10060         push(tmp3);
10061         push(in_out1);
10062         push(in_out2);
10063         tmp4 = tmp3;
10064         tmp5 = in_out1;
10065         n_tmp6 = in_out2;
10066 
10067       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10068             w_xtmp1, w_xtmp2, w_xtmp3,
10069             tmp4, tmp5,
10070             n_tmp6);
10071 
10072         pop(in_out2);
10073         pop(in_out1);
10074         pop(tmp3);
10075 
10076     addl(in_out2, 2 * size);
10077     subl(in_out1, 3 * size);
10078     jmp(L_processPartitions);
10079 
10080   bind(L_exit);
10081 }
10082 #endif //LP64
10083 
10084 #ifdef _LP64
10085 // Algorithm 2: Pipelined usage of the CRC32 instruction.
10086 // Input: A buffer I of L bytes.
10087 // Output: the CRC32C value of the buffer.
10088 // Notations:
10089 // Write L = 24N + r, with N = floor (L/24).
10090 // r = L mod 24 (0 <= r < 24).
10091 // Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
10092 // N quadwords, and R consists of r bytes.
10093 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
10094 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
10095 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
10096 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
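// Worked example (illustrative): for a buffer of L = 100 bytes, N = floor(100/24) = 4
// and r = 100 mod 24 = 4, so A, B and C are 4 quadwords (32 bytes) each and the
// remaining 4 bytes are handled by the word-by-word / byte-by-byte tail code below.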
10097 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10098                                           Register tmp1, Register tmp2, Register tmp3,
10099                                           Register tmp4, Register tmp5, Register tmp6,
10100                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10101                                           bool is_pclmulqdq_supported) {
10102   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10103   Label L_wordByWord;
10104   Label L_byteByByteProlog;
10105   Label L_byteByByte;
10106   Label L_exit;
10107 
10108   if (is_pclmulqdq_supported ) {
10109     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10110     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
10111 
10112     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10113     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10114 
10115     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10116     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10117     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
10118   } else {
10119     const_or_pre_comp_const_index[0] = 1;
10120     const_or_pre_comp_const_index[1] = 0;
10121 
10122     const_or_pre_comp_const_index[2] = 3;
10123     const_or_pre_comp_const_index[3] = 2;
10124 
10125     const_or_pre_comp_const_index[4] = 5;
10126     const_or_pre_comp_const_index[5] = 4;
10127    }
10128   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10129                     in2, in1, in_out,
10130                     tmp1, tmp2, tmp3,
10131                     w_xtmp1, w_xtmp2, w_xtmp3,
10132                     tmp4, tmp5,
10133                     tmp6);
10134   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10135                     in2, in1, in_out,
10136                     tmp1, tmp2, tmp3,
10137                     w_xtmp1, w_xtmp2, w_xtmp3,
10138                     tmp4, tmp5,
10139                     tmp6);
10140   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10141                     in2, in1, in_out,
10142                     tmp1, tmp2, tmp3,
10143                     w_xtmp1, w_xtmp2, w_xtmp3,
10144                     tmp4, tmp5,
10145                     tmp6);
10146   movl(tmp1, in2);
10147   andl(tmp1, 0x00000007);
10148   negl(tmp1);
10149   addl(tmp1, in2);
10150   addq(tmp1, in1);
10151 
10152   BIND(L_wordByWord);
10153   cmpq(in1, tmp1);
10154   jcc(Assembler::greaterEqual, L_byteByByteProlog);
10155     crc32(in_out, Address(in1, 0), 4);
10156     addq(in1, 4);
10157     jmp(L_wordByWord);
10158 
10159   BIND(L_byteByByteProlog);
10160   andl(in2, 0x00000007);
10161   movl(tmp2, 1);
10162 
10163   BIND(L_byteByByte);
10164   cmpl(tmp2, in2);
10165   jccb(Assembler::greater, L_exit);
10166     crc32(in_out, Address(in1, 0), 1);
10167     incq(in1);
10168     incl(tmp2);
10169     jmp(L_byteByByte);
10170 
10171   BIND(L_exit);
10172 }
10173 #else
10174 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10175                                           Register tmp1, Register  tmp2, Register tmp3,
10176                                           Register tmp4, Register  tmp5, Register tmp6,
10177                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10178                                           bool is_pclmulqdq_supported) {
10179   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10180   Label L_wordByWord;
10181   Label L_byteByByteProlog;
10182   Label L_byteByByte;
10183   Label L_exit;
10184 
10185   if (is_pclmulqdq_supported) {
10186     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10187     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
10188 
10189     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10190     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10191 
10192     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10193     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10194   } else {
10195     const_or_pre_comp_const_index[0] = 1;
10196     const_or_pre_comp_const_index[1] = 0;
10197 
10198     const_or_pre_comp_const_index[2] = 3;
10199     const_or_pre_comp_const_index[3] = 2;
10200 
10201     const_or_pre_comp_const_index[4] = 5;
10202     const_or_pre_comp_const_index[5] = 4;
10203   }
10204   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10205                     in2, in1, in_out,
10206                     tmp1, tmp2, tmp3,
10207                     w_xtmp1, w_xtmp2, w_xtmp3,
10208                     tmp4, tmp5,
10209                     tmp6);
10210   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10211                     in2, in1, in_out,
10212                     tmp1, tmp2, tmp3,
10213                     w_xtmp1, w_xtmp2, w_xtmp3,
10214                     tmp4, tmp5,
10215                     tmp6);
10216   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10217                     in2, in1, in_out,
10218                     tmp1, tmp2, tmp3,
10219                     w_xtmp1, w_xtmp2, w_xtmp3,
10220                     tmp4, tmp5,
10221                     tmp6);
10222   movl(tmp1, in2);
10223   andl(tmp1, 0x00000007);
10224   negl(tmp1);
10225   addl(tmp1, in2);
10226   addl(tmp1, in1);
10227 
10228   BIND(L_wordByWord);
10229   cmpl(in1, tmp1);
10230   jcc(Assembler::greaterEqual, L_byteByByteProlog);
10231     crc32(in_out, Address(in1,0), 4);
10232     addl(in1, 4);
10233     jmp(L_wordByWord);
10234 
10235   BIND(L_byteByByteProlog);
10236   andl(in2, 0x00000007);
10237   movl(tmp2, 1);
10238 
10239   BIND(L_byteByByte);
10240   cmpl(tmp2, in2);
10241   jccb(Assembler::greater, L_exit);
10242     movb(tmp1, Address(in1, 0));
10243     crc32(in_out, tmp1, 1);
10244     incl(in1);
10245     incl(tmp2);
10246     jmp(L_byteByByte);
10247 
10248   BIND(L_exit);
10249 }
10250 #endif // LP64
10251 #undef BIND
10252 #undef BLOCK_COMMENT
10253 
10254 // Compress char[] array to byte[].
10255 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
10256 //   @HotSpotIntrinsicCandidate
10257 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
10258 //     for (int i = 0; i < len; i++) {
10259 //       int c = src[srcOff++];
10260 //       if (c >>> 8 != 0) {
10261 //         return 0;
10262 //       }
10263 //       dst[dstOff++] = (byte)c;
10264 //     }
10265 //     return len;
10266 //   }
10267 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10268   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10269   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10270   Register tmp5, Register result) {
10271   Label copy_chars_loop, return_length, return_zero, done;
10272 
10273   // rsi: src
10274   // rdi: dst
10275   // rdx: len
10276   // rcx: tmp5
10277   // rax: result
10278 
10279   // rsi holds start addr of source char[] to be compressed
10280   // rdi holds start addr of destination byte[]
10281   // rdx holds length
10282 
10283   assert(len != result, "");
10284 
10285   // save length for return
10286   push(len);
10287 
10288   if ((UseAVX > 2) && // AVX512
10289     VM_Version::supports_avx512vlbw() &&
10290     VM_Version::supports_bmi2()) {
10291 
10292     set_vector_masking();  // opening of the stub context for programming mask registers
10293 
10294     Label copy_32_loop, copy_loop_tail, restore_k1_return_zero, below_threshold;
10295 
10296     // alignment
10297     Label post_alignment;
10298 
10299     // if length of the string is less than 32, handle it the old-fashioned way
10300     testl(len, -32);
10301     jcc(Assembler::zero, below_threshold);
10302 
10303     // First check whether a character is compressible (<= 0xFF).
10304     // Create mask to test for Unicode chars inside zmm vector
10305     movl(result, 0x00FF);
10306     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
10307 
10308     // Save k1
10309     kmovql(k3, k1);
10310 
10311     testl(len, -64);
10312     jcc(Assembler::zero, post_alignment);
10313 
10314     movl(tmp5, dst);
10315     andl(tmp5, (32 - 1));
10316     negl(tmp5);
10317     andl(tmp5, (32 - 1));
10318 
10319     // bail out when there is nothing to be done
10320     testl(tmp5, 0xFFFFFFFF);
10321     jcc(Assembler::zero, post_alignment);
10322 
10323     // ~(~0 << len), where len is the # of remaining elements to process
10324     movl(result, 0xFFFFFFFF);
10325     shlxl(result, result, tmp5);
10326     notl(result);
10327     kmovdl(k1, result);
10328 
10329     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10330     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10331     ktestd(k2, k1);
10332     jcc(Assembler::carryClear, restore_k1_return_zero);
10333 
10334     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10335 
10336     addptr(src, tmp5);
10337     addptr(src, tmp5);
10338     addptr(dst, tmp5);
10339     subl(len, tmp5);
10340 
10341     bind(post_alignment);
10342     // end of alignment
10343 
10344     movl(tmp5, len);
10345     andl(tmp5, (32 - 1));    // tail count (in chars)
10346     andl(len, ~(32 - 1));    // vector count (in chars)
10347     jcc(Assembler::zero, copy_loop_tail);
10348 
10349     lea(src, Address(src, len, Address::times_2));
10350     lea(dst, Address(dst, len, Address::times_1));
10351     negptr(len);
10352 
10353     bind(copy_32_loop);
10354     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
10355     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10356     kortestdl(k2, k2);
10357     jcc(Assembler::carryClear, restore_k1_return_zero);
10358 
10359     // All elements in the current chunk are valid candidates for
10360     // compression. Write the truncated byte elements to memory.
10361     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
10362     addptr(len, 32);
10363     jcc(Assembler::notZero, copy_32_loop);
10364 
10365     bind(copy_loop_tail);
10366     // bail out when there is nothing to be done
10367     testl(tmp5, 0xFFFFFFFF);
10368     // Restore k1
10369     kmovql(k1, k3);
10370     jcc(Assembler::zero, return_length);
10371 
10372     movl(len, tmp5);
10373 
10374     // ~(~0 << len), where len is the # of remaining elements to process
10375     movl(result, 0xFFFFFFFF);
10376     shlxl(result, result, len);
10377     notl(result);
10378 
10379     kmovdl(k1, result);
10380 
10381     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
10382     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10383     ktestd(k2, k1);
10384     jcc(Assembler::carryClear, restore_k1_return_zero);
10385 
10386     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
10387     // Restore k1
10388     kmovql(k1, k3);
10389     jmp(return_length);
10390 
10391     bind(restore_k1_return_zero);
10392     // Restore k1
10393     kmovql(k1, k3);
10394     jmp(return_zero);
10395 
10396     clear_vector_masking();   // closing of the stub context for programming mask registers
10397 
10398     bind(below_threshold);
10399   }
10400 
10401   if (UseSSE42Intrinsics) {
10402     Label copy_32_loop, copy_16, copy_tail;
10403 
10404     movl(result, len);
10405 
10406     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
10407 
10408     // vectored compression
10409     andl(len, 0xfffffff0);    // vector count (in chars)
10410     andl(result, 0x0000000f);    // tail count (in chars)
10411     testl(len, len);
10412     jccb(Assembler::zero, copy_16);
10413 
10414     // compress 16 chars per iter
10415     movdl(tmp1Reg, tmp5);
10416     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10417     pxor(tmp4Reg, tmp4Reg);
10418 
10419     lea(src, Address(src, len, Address::times_2));
10420     lea(dst, Address(dst, len, Address::times_1));
10421     negptr(len);
10422 
10423     bind(copy_32_loop);
10424     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10425     por(tmp4Reg, tmp2Reg);
10426     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
10427     por(tmp4Reg, tmp3Reg);
10428     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
10429     jcc(Assembler::notZero, return_zero);
10430     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10431     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
10432     addptr(len, 16);
10433     jcc(Assembler::notZero, copy_32_loop);
10434 
10435     // compress next vector of 8 chars (if any)
10436     bind(copy_16);
10437     movl(len, result);
10438     andl(len, 0xfffffff8);    // vector count (in chars)
10439     andl(result, 0x00000007);    // tail count (in chars)
10440     testl(len, len);
10441     jccb(Assembler::zero, copy_tail);
10442 
10443     movdl(tmp1Reg, tmp5);
10444     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10445     pxor(tmp3Reg, tmp3Reg);
10446 
10447     movdqu(tmp2Reg, Address(src, 0));
10448     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
10449     jccb(Assembler::notZero, return_zero);
10450     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10451     movq(Address(dst, 0), tmp2Reg);
10452     addptr(src, 16);
10453     addptr(dst, 8);
10454 
10455     bind(copy_tail);
10456     movl(len, result);
10457   }
10458   // compress 1 char per iter
10459   testl(len, len);
10460   jccb(Assembler::zero, return_length);
10461   lea(src, Address(src, len, Address::times_2));
10462   lea(dst, Address(dst, len, Address::times_1));
10463   negptr(len);
10464 
10465   bind(copy_chars_loop);
10466   load_unsigned_short(result, Address(src, len, Address::times_2));
10467   testl(result, 0xff00);      // check if Unicode char
10468   jccb(Assembler::notZero, return_zero);
10469   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
10470   increment(len);
10471   jcc(Assembler::notZero, copy_chars_loop);
10472 
10473   // if compression succeeded, return length
10474   bind(return_length);
10475   pop(result);
10476   jmpb(done);
10477 
10478   // if compression failed, return 0
10479   bind(return_zero);
10480   xorl(result, result);
10481   addptr(rsp, wordSize);
10482 
10483   bind(done);
10484 }
10485 
10486 // Inflate byte[] array to char[].
10487 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
10488 //   @HotSpotIntrinsicCandidate
10489 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
10490 //     for (int i = 0; i < len; i++) {
10491 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
10492 //     }
10493 //   }
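      // Depending on the available CPU features, the code below uses an AVX-512BW/VL
      // masked loop (32 chars per iteration), an AVX2 or SSE loop (16 or 8 chars per
      // iteration), and a scalar 1-char tail loop.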
10494 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10495   XMMRegister tmp1, Register tmp2) {
10496   Label copy_chars_loop, done, below_threshold;
10497   // rsi: src
10498   // rdi: dst
10499   // rdx: len
10500   // rcx: tmp2
10501 
10502   // rsi holds start addr of source byte[] to be inflated
10503   // rdi holds start addr of destination char[]
10504   // rdx holds length
10505   assert_different_registers(src, dst, len, tmp2);
10506 
10507   if ((UseAVX > 2) && // AVX512
10508     VM_Version::supports_avx512vlbw() &&
10509     VM_Version::supports_bmi2()) {
10510 
10511     set_vector_masking();  // opening of the stub context for programming mask registers
10512 
10513     Label copy_32_loop, copy_tail;
10514     Register tmp3_aliased = len;
10515 
10516     // if the length of the string is less than 16, handle it the old-fashioned way
10517     testl(len, -16);
10518     jcc(Assembler::zero, below_threshold);
10519 
10520     // Pre-compute the tail and vector counts (and bias the pointers below) so the
10521     // main loop needs only a single arithmetic operation per iteration
10522     movl(tmp2, len);
10523     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
10524     andl(len, -32);     // vector count
10525     jccb(Assembler::zero, copy_tail);
10526 
10527     lea(src, Address(src, len, Address::times_1));
10528     lea(dst, Address(dst, len, Address::times_2));
10529     negptr(len);
10530 
10531 
10532     // inflate 32 chars per iter
10533     bind(copy_32_loop);
10534     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
10535     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
10536     addptr(len, 32);
10537     jcc(Assembler::notZero, copy_32_loop);
10538 
10539     bind(copy_tail);
10540     // bail out when there is nothing to be done
10541     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
10542     jcc(Assembler::zero, done);
10543 
10544     // Save k1
10545     kmovql(k2, k1);
10546 
10547     // ~(~0 << length), where length is the # of remaining elements to process
10548     movl(tmp3_aliased, -1);
10549     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
10550     notl(tmp3_aliased);
10551     kmovdl(k1, tmp3_aliased);
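          // Example: tmp2 == 3 gives ~(-1 << 3) == 0x00000007, enabling only the low 3 lanes.
          // With k1 as the write mask, only those byte lanes are zero-extended into word
          // lanes and only those word lanes are stored to the destination.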
10552     evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
10553     evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
10554 
10555     // Restore k1
10556     kmovql(k1, k2);
10557     jmp(done);
10558 
10559     clear_vector_masking();   // closing of the stub context for programming mask registers
10560   }
10561   if (UseSSE42Intrinsics) {
10562     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
10563 
10564     movl(tmp2, len);
10565 
10566     if (UseAVX > 1) {
10567       andl(tmp2, (16 - 1)); // tail count (in chars)
10568       andl(len, -16);       // vector count (in chars)
10569       jccb(Assembler::zero, copy_new_tail);
10570     } else {
10571       andl(tmp2, 0x00000007);   // tail count (in chars)
10572       andl(len, 0xfffffff8);    // vector count (in chars)
10573       jccb(Assembler::zero, copy_tail);
10574     }
10575 
10576     // vectored inflation
10577     lea(src, Address(src, len, Address::times_1));
10578     lea(dst, Address(dst, len, Address::times_2));
10579     negptr(len);
10580 
10581     if (UseAVX > 1) {
10582       bind(copy_16_loop);
10583       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
10584       vmovdqu(Address(dst, len, Address::times_2), tmp1);
10585       addptr(len, 16);
10586       jcc(Assembler::notZero, copy_16_loop);
10587 
10588       bind(below_threshold);
10589       bind(copy_new_tail);
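            // Reached either from the AVX-512 path's below_threshold (len still holds the
            // full, sub-16 length and tmp2 is not yet set) or from the AVX2 loop above
            // (len has counted up to zero and tmp2 holds the tail count); the moves below
            // leave the remaining char count in len and tmp2 before splitting it into
            // vector and tail counts.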
10590       if ((UseAVX > 2) &&
10591         VM_Version::supports_avx512vlbw() &&
10592         VM_Version::supports_bmi2()) {
10593         movl(tmp2, len);
10594       } else {
10595         movl(len, tmp2);
10596       }
10597       andl(tmp2, 0x00000007);   // tail count (in chars)
10598       andl(len, 0xFFFFFFF8);    // vector count (in chars)
10599       jccb(Assembler::zero, copy_tail);
10600 
10601       pmovzxbw(tmp1, Address(src, 0));
10602       movdqu(Address(dst, 0), tmp1);
10603       addptr(src, 8);
10604       addptr(dst, 2 * 8);
10605 
10606       jmp(copy_tail, true);
10607     }
10608 
10609     // inflate 8 chars per iter
10610     bind(copy_8_loop);
10611     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
10612     movdqu(Address(dst, len, Address::times_2), tmp1);
10613     addptr(len, 8);
10614     jcc(Assembler::notZero, copy_8_loop);
10615 
10616     bind(copy_tail);
10617     movl(len, tmp2);
10618 
10619     cmpl(len, 4);
10620     jccb(Assembler::less, copy_bytes);
10621 
10622     movdl(tmp1, Address(src, 0));  // load 4 byte chars
10623     pmovzxbw(tmp1, tmp1);
10624     movq(Address(dst, 0), tmp1);
10625     subptr(len, 4);
10626     addptr(src, 4);
10627     addptr(dst, 8);
10628 
10629     bind(copy_bytes);
10630   } else {
10631     bind(below_threshold);
10632   }
10633 
10634   testl(len, len);
10635   jccb(Assembler::zero, done);
10636   lea(src, Address(src, len, Address::times_1));
10637   lea(dst, Address(dst, len, Address::times_2));
10638   negptr(len);
10639 
10640   // inflate 1 char per iter
10641   bind(copy_chars_loop);
10642   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
10643   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
10644   increment(len);
10645   jcc(Assembler::notZero, copy_chars_loop);
10646 
10647   bind(done);
10648 }
10649 
10650 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10651   switch (cond) {
10652     // Note some conditions are synonyms for others
10653     case Assembler::zero:         return Assembler::notZero;
10654     case Assembler::notZero:      return Assembler::zero;
10655     case Assembler::less:         return Assembler::greaterEqual;
10656     case Assembler::lessEqual:    return Assembler::greater;
10657     case Assembler::greater:      return Assembler::lessEqual;
10658     case Assembler::greaterEqual: return Assembler::less;
10659     case Assembler::below:        return Assembler::aboveEqual;
10660     case Assembler::belowEqual:   return Assembler::above;
10661     case Assembler::above:        return Assembler::belowEqual;
10662     case Assembler::aboveEqual:   return Assembler::below;
10663     case Assembler::overflow:     return Assembler::noOverflow;
10664     case Assembler::noOverflow:   return Assembler::overflow;
10665     case Assembler::negative:     return Assembler::positive;
10666     case Assembler::positive:     return Assembler::negative;
10667     case Assembler::parity:       return Assembler::noParity;
10668     case Assembler::noParity:     return Assembler::parity;
10669   }
10670   ShouldNotReachHere(); return Assembler::overflow;
10671 }
10672 
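      // SkipIfEqual emits a byte compare of *flag_addr against 'value' and a conditional
      // jump at construction; the destructor binds the jump target, so code emitted while
      // the guard object is in scope is skipped whenever the flag equals 'value'.
      // Usage sketch (flag name hypothetical):
      //   { SkipIfEqual skip(masm, &SomeDiagnosticFlag, false);
      //     ... emitted code here executes only when SomeDiagnosticFlag is true ...
      //   }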
10673 SkipIfEqual::SkipIfEqual(
10674     MacroAssembler* masm, const bool* flag_addr, bool value) {
10675   _masm = masm;
10676   _masm->cmp8(ExternalAddress((address)flag_addr), value);
10677   _masm->jcc(Assembler::equal, _label);
10678 }
10679 
10680 SkipIfEqual::~SkipIfEqual() {
10681   _masm->bind(_label);
10682 }
10683 
10684 // 32-bit Windows has its own fast-path implementation
10685 // of get_thread
10686 #if !defined(WIN32) || defined(_LP64)
10687 
10688 // This is simply a call to Thread::current()
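      // Caller-saved registers of the native ABI are preserved around the call, since the
      // surrounding generated code may keep live values in them; the result comes back in
      // rax and is moved into 'thread' when a different register was requested.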
10689 void MacroAssembler::get_thread(Register thread) {
10690   if (thread != rax) {
10691     push(rax);
10692   }
10693   LP64_ONLY(push(rdi);)
10694   LP64_ONLY(push(rsi);)
10695   push(rdx);
10696   push(rcx);
10697 #ifdef _LP64
10698   push(r8);
10699   push(r9);
10700   push(r10);
10701   push(r11);
10702 #endif
10703 
10704   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10705 
10706 #ifdef _LP64
10707   pop(r11);
10708   pop(r10);
10709   pop(r9);
10710   pop(r8);
10711 #endif
10712   pop(rcx);
10713   pop(rdx);
10714   LP64_ONLY(pop(rsi);)
10715   LP64_ONLY(pop(rdi);)
10716   if (thread != rax) {
10717     mov(thread, rax);
10718     pop(rax);
10719   }
10720 }
10721 
10722 #endif