1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/collectedHeap.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "memory/universe.hpp"
  36 #include "oops/accessDecorators.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/biasedLocking.hpp"
  41 #include "runtime/flags/flagSetting.hpp"
  42 #include "runtime/interfaceSupport.inline.hpp"
  43 #include "runtime/objectMonitor.hpp"
  44 #include "runtime/os.hpp"
  45 #include "runtime/safepoint.hpp"
  46 #include "runtime/safepointMechanism.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.hpp"
  50 #include "utilities/macros.hpp"
  51 #include "crc32c.h"
  52 #ifdef COMPILER2
  53 #include "opto/intrinsicnode.hpp"
  54 #endif
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
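     // Maps each Assembler::Condition value (used as the index) to the opposite
     // condition; the per-entry comments give the condition-code encodings.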
  70 static Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf, */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First, all the versions that differ between 32-bit and 64-bit,
  94 // unless the difference is trivial (a line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr) {
 105   return Address::make_array(adr);
 106 }
 107 
 108 void MacroAssembler::call_VM_leaf_base(address entry_point,
 109                                        int number_of_arguments) {
 110   call(RuntimeAddress(entry_point));
 111   increment(rsp, number_of_arguments * wordSize);
 112 }
 113 
 114 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 115   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 116 }
 117 
 118 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 119   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 120 }
 121 
 122 void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
 123   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 124 }
 125 
 126 void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
 127   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 128 }
 129 
 130 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 131   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 132   bs->obj_equals(this, src1, obj);
 133 }
 134 
 135 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 136   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 137   bs->obj_equals(this, src1, obj);
 138 }
 139 
 140 void MacroAssembler::extend_sign(Register hi, Register lo) {
 141   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 142   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 143     cdql();
 144   } else {
 145     movl(hi, lo);
 146     sarl(hi, 31);
 147   }
 148 }
 149 
 150 void MacroAssembler::jC2(Register tmp, Label& L) {
 151   // set parity bit if FPU flag C2 is set (via rax)
 152   save_rax(tmp);
 153   fwait(); fnstsw_ax();
 154   sahf();
 155   restore_rax(tmp);
 156   // branch
 157   jcc(Assembler::parity, L);
 158 }
 159 
 160 void MacroAssembler::jnC2(Register tmp, Label& L) {
 161   // set parity bit if FPU flag C2 is set (via rax)
 162   save_rax(tmp);
 163   fwait(); fnstsw_ax();
 164   sahf();
 165   restore_rax(tmp);
 166   // branch
 167   jcc(Assembler::noParity, L);
 168 }
 169 
 170 // 32-bit can do a case-table jump in one instruction, but we no longer allow the base
 171 // to be installed in the Address class.
 172 void MacroAssembler::jump(ArrayAddress entry) {
 173   jmp(as_Address(entry));
 174 }
 175 
 176 // Note: y_lo will be destroyed
 177 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 178   // Long compare for Java (semantics as described in JVM spec.)
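       // Result (in x_hi): -1 if x < y, 0 if x == y, +1 if x > y.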
 179   Label high, low, done;
 180 
 181   cmpl(x_hi, y_hi);
 182   jcc(Assembler::less, low);
 183   jcc(Assembler::greater, high);
 184   // x_hi is the return register
 185   xorl(x_hi, x_hi);
 186   cmpl(x_lo, y_lo);
 187   jcc(Assembler::below, low);
 188   jcc(Assembler::equal, done);
 189 
 190   bind(high);
 191   xorl(x_hi, x_hi);
 192   increment(x_hi);
 193   jmp(done);
 194 
 195   bind(low);
 196   xorl(x_hi, x_hi);
 197   decrementl(x_hi);
 198 
 199   bind(done);
 200 }
 201 
 202 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 203     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 204 }
 205 
 206 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 207   // leal(dst, as_Address(adr));
 208   // see note in movl as to why we must use a move
 209   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 210 }
 211 
 212 void MacroAssembler::leave() {
 213   mov(rsp, rbp);
 214   pop(rbp);
 215 }
 216 
 217 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 218   // Multiplication of two Java long values stored on the stack
 219   // as illustrated below. Result is in rdx:rax.
 220   //
 221   // rsp ---> [  ??  ] \               \
 222   //            ....    | y_rsp_offset  |
 223   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 224   //          [ y_hi ]                  | (in bytes)
 225   //            ....                    |
 226   //          [ x_lo ]                 /
 227   //          [ x_hi ]
 228   //            ....
 229   //
 230   // Basic idea: lo(result) = lo(x_lo * y_lo)
 231   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 232   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 233   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 234   Label quick;
 235   // load x_hi, y_hi and check if quick
 236   // multiplication is possible
 237   movl(rbx, x_hi);
 238   movl(rcx, y_hi);
 239   movl(rax, rbx);
 240   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 241   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 242   // do full multiplication
 243   // 1st step
 244   mull(y_lo);                                    // x_hi * y_lo
 245   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
 246   // 2nd step
 247   movl(rax, x_lo);
 248   mull(rcx);                                     // x_lo * y_hi
 249   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
 250   // 3rd step
 251   bind(quick);                                   // note: rbx = 0 if quick multiply!
 252   movl(rax, x_lo);
 253   mull(y_lo);                                    // x_lo * y_lo
 254   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 255 }
 256 
 257 void MacroAssembler::lneg(Register hi, Register lo) {
 258   negl(lo);
 259   adcl(hi, 0);
 260   negl(hi);
 261 }
 262 
 263 void MacroAssembler::lshl(Register hi, Register lo) {
 264   // Java shift left long support (semantics as described in JVM spec., p.305)
 265   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 266   // shift value is in rcx !
 267   assert(hi != rcx, "must not use rcx");
 268   assert(lo != rcx, "must not use rcx");
 269   const Register s = rcx;                        // shift count
 270   const int      n = BitsPerWord;
 271   Label L;
 272   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 273   cmpl(s, n);                                    // if (s < n)
 274   jcc(Assembler::less, L);                       // else (s >= n)
 275   movl(hi, lo);                                  // x := x << n
 276   xorl(lo, lo);
 277   // Note: subl(s, n) is not needed since the Intel shift instructions use the count in rcx mod n!
 278   bind(L);                                       // s (mod n) < n
 279   shldl(hi, lo);                                 // x := x << s
 280   shll(lo);
 281 }
 282 
 283 
 284 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 285   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 286   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 287   assert(hi != rcx, "must not use rcx");
 288   assert(lo != rcx, "must not use rcx");
 289   const Register s = rcx;                        // shift count
 290   const int      n = BitsPerWord;
 291   Label L;
 292   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 293   cmpl(s, n);                                    // if (s < n)
 294   jcc(Assembler::less, L);                       // else (s >= n)
 295   movl(lo, hi);                                  // x := x >> n
 296   if (sign_extension) sarl(hi, 31);
 297   else                xorl(hi, hi);
 298   // Note: subl(s, n) is not needed since the Intel shift instructions use the count in rcx mod n!
 299   bind(L);                                       // s (mod n) < n
 300   shrdl(lo, hi);                                 // x := x >> s
 301   if (sign_extension) sarl(hi);
 302   else                shrl(hi);
 303 }
 304 
 305 void MacroAssembler::movoop(Register dst, jobject obj) {
 306   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 307 }
 308 
 309 void MacroAssembler::movoop(Address dst, jobject obj) {
 310   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 311 }
 312 
 313 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 314   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 315 }
 316 
 317 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 318   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 319 }
 320 
 321 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 322   // scratch register is not used,
 323   // it is defined to match parameters of 64-bit version of this method.
 324   if (src.is_lval()) {
 325     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 326   } else {
 327     movl(dst, as_Address(src));
 328   }
 329 }
 330 
 331 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 332   movl(as_Address(dst), src);
 333 }
 334 
 335 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 336   movl(dst, as_Address(src));
 337 }
 338 
 339 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 340 void MacroAssembler::movptr(Address dst, intptr_t src) {
 341   movl(dst, src);
 342 }
 343 
 344 
 345 void MacroAssembler::pop_callee_saved_registers() {
 346   pop(rcx);
 347   pop(rdx);
 348   pop(rdi);
 349   pop(rsi);
 350 }
 351 
 352 void MacroAssembler::pop_fTOS() {
 353   fld_d(Address(rsp, 0));
 354   addl(rsp, 2 * wordSize);
 355 }
 356 
 357 void MacroAssembler::push_callee_saved_registers() {
 358   push(rsi);
 359   push(rdi);
 360   push(rdx);
 361   push(rcx);
 362 }
 363 
 364 void MacroAssembler::push_fTOS() {
 365   subl(rsp, 2 * wordSize);
 366   fstp_d(Address(rsp, 0));
 367 }
 368 
 369 
 370 void MacroAssembler::pushoop(jobject obj) {
 371   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 372 }
 373 
 374 void MacroAssembler::pushklass(Metadata* obj) {
 375   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 376 }
 377 
 378 void MacroAssembler::pushptr(AddressLiteral src) {
 379   if (src.is_lval()) {
 380     push_literal32((int32_t)src.target(), src.rspec());
 381   } else {
 382     pushl(as_Address(src));
 383   }
 384 }
 385 
 386 void MacroAssembler::set_word_if_not_zero(Register dst) {
 387   xorl(dst, dst);
 388   set_byte_if_not_zero(dst);
 389 }
 390 
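     // Argument-passing helpers used when setting up calls into the VM:
     // on 32-bit, arguments are simply pushed onto the stack.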
 391 static void pass_arg0(MacroAssembler* masm, Register arg) {
 392   masm->push(arg);
 393 }
 394 
 395 static void pass_arg1(MacroAssembler* masm, Register arg) {
 396   masm->push(arg);
 397 }
 398 
 399 static void pass_arg2(MacroAssembler* masm, Register arg) {
 400   masm->push(arg);
 401 }
 402 
 403 static void pass_arg3(MacroAssembler* masm, Register arg) {
 404   masm->push(arg);
 405 }
 406 
 407 #ifndef PRODUCT
 408 extern "C" void findpc(intptr_t x);
 409 #endif
 410 
 411 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 412   // In order to get locks to work, we need to fake an in_VM state
 413   JavaThread* thread = JavaThread::current();
 414   JavaThreadState saved_state = thread->thread_state();
 415   thread->set_thread_state(_thread_in_vm);
 416   if (ShowMessageBoxOnError) {
 420     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 421       ttyLocker ttyl;
 422       BytecodeCounter::print();
 423     }
 424     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 425     // This is the value of eip which points to where verify_oop will return.
 426     if (os::message_box(msg, "Execution stopped, print registers?")) {
 427       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 428       BREAKPOINT;
 429     }
 430   } else {
 431     ttyLocker ttyl;
 432     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
 433   }
 434   // Don't assert holding the ttyLock
 435   assert(false, "DEBUG MESSAGE: %s", msg);
 436   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 437 }
 438 
 439 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 440   ttyLocker ttyl;
 441   FlagSetting fs(Debugging, true);
 442   tty->print_cr("eip = 0x%08x", eip);
 443 #ifndef PRODUCT
 444   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 445     tty->cr();
 446     findpc(eip);
 447     tty->cr();
 448   }
 449 #endif
 450 #define PRINT_REG(rax) \
 451   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 452   PRINT_REG(rax);
 453   PRINT_REG(rbx);
 454   PRINT_REG(rcx);
 455   PRINT_REG(rdx);
 456   PRINT_REG(rdi);
 457   PRINT_REG(rsi);
 458   PRINT_REG(rbp);
 459   PRINT_REG(rsp);
 460 #undef PRINT_REG
 461   // Print some words near top of stack.
 462   int* dump_sp = (int*) rsp;
 463   for (int col1 = 0; col1 < 8; col1++) {
 464     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 465     os::print_location(tty, *dump_sp++);
 466   }
 467   for (int row = 0; row < 16; row++) {
 468     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 469     for (int col = 0; col < 8; col++) {
 470       tty->print(" 0x%08x", *dump_sp++);
 471     }
 472     tty->cr();
 473   }
 474   // Print some instructions around pc:
 475   Disassembler::decode((address)eip-64, (address)eip);
 476   tty->print_cr("--------");
 477   Disassembler::decode((address)eip, (address)eip+32);
 478 }
 479 
 480 void MacroAssembler::stop(const char* msg) {
 481   ExternalAddress message((address)msg);
 482   // push address of message
 483   pushptr(message.addr());
 484   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 485   pusha();                                            // push registers
 486   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 487   hlt();
 488 }
 489 
 490 void MacroAssembler::warn(const char* msg) {
 491   push_CPU_state();
 492 
 493   ExternalAddress message((address) msg);
 494   // push address of message
 495   pushptr(message.addr());
 496 
 497   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 498   addl(rsp, wordSize);       // discard argument
 499   pop_CPU_state();
 500 }
 501 
 502 void MacroAssembler::print_state() {
 503   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 504   pusha();                                            // push registers
 505 
 506   push_CPU_state();
 507   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 508   pop_CPU_state();
 509 
 510   popa();
 511   addl(rsp, wordSize);
 512 }
 513 
 514 #else // _LP64
 515 
 516 // 64 bit versions
 517 
 518 Address MacroAssembler::as_Address(AddressLiteral adr) {
 519   // amd64 always does this as a pc-rel
 520   // we can be absolute or disp-based depending on the instruction type
 521   // jmp/call use displacements, others are absolute
 522   assert(!adr.is_lval(), "must be rval");
 523   assert(reachable(adr), "must be");
 524   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 525 
 526 }
 527 
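     // Note: loads the array base into rscratch1, so rscratch1 is clobbered and
     // the returned Address is only valid while rscratch1 still holds the base.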
 528 Address MacroAssembler::as_Address(ArrayAddress adr) {
 529   AddressLiteral base = adr.base();
 530   lea(rscratch1, base);
 531   Address index = adr.index();
 532   assert(index._disp == 0, "must not have disp"); // maybe it can?
 533   Address array(rscratch1, index._index, index._scale, index._disp);
 534   return array;
 535 }
 536 
 537 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 538   Label L, E;
 539 
 540 #ifdef _WIN64
 541   // Windows always allocates space for its register args
 542   assert(num_args <= 4, "only register arguments supported");
 543   subq(rsp,  frame::arg_reg_save_area_bytes);
 544 #endif
 545 
 546   // Align stack if necessary
 547   testl(rsp, 15);
 548   jcc(Assembler::zero, L);
 549 
 550   subq(rsp, 8);
 551   {
 552     call(RuntimeAddress(entry_point));
 553   }
 554   addq(rsp, 8);
 555   jmp(E);
 556 
 557   bind(L);
 558   {
 559     call(RuntimeAddress(entry_point));
 560   }
 561 
 562   bind(E);
 563 
 564 #ifdef _WIN64
 565   // restore stack pointer
 566   addq(rsp, frame::arg_reg_save_area_bytes);
 567 #endif
 568 
 569 }
 570 
 571 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 572   assert(!src2.is_lval(), "should use cmpptr");
 573 
 574   if (reachable(src2)) {
 575     cmpq(src1, as_Address(src2));
 576   } else {
 577     lea(rscratch1, src2);
 578     Assembler::cmpq(src1, Address(rscratch1, 0));
 579   }
 580 }
 581 
 582 int MacroAssembler::corrected_idivq(Register reg) {
 583   // Full implementation of Java ldiv and lrem; checks for special
 584   // case as described in JVM spec., p.243 & p.271.  The function
 585   // returns the (pc) offset of the idivl instruction - may be needed
 586   // for implicit exceptions.
 587   //
 588   //         normal case                           special case
 589   //
 590   // input : rax: dividend                         min_long
 591   //         reg: divisor   (may not be eax/edx)   -1
 592   //
 593   // output: rax: quotient  (= rax idiv reg)       min_long
 594   //         rdx: remainder (= rax irem reg)       0
 595   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 596   static const int64_t min_long = 0x8000000000000000;
 597   Label normal_case, special_case;
 598 
 599   // check for special case
 600   cmp64(rax, ExternalAddress((address) &min_long));
 601   jcc(Assembler::notEqual, normal_case);
 602   xorl(rdx, rdx); // prepare rdx for possible special case (where
 603                   // remainder = 0)
 604   cmpq(reg, -1);
 605   jcc(Assembler::equal, special_case);
 606 
 607   // handle normal case
 608   bind(normal_case);
 609   cdqq();
 610   int idivq_offset = offset();
 611   idivq(reg);
 612 
 613   // normal and special case exit
 614   bind(special_case);
 615 
 616   return idivq_offset;
 617 }
 618 
 619 void MacroAssembler::decrementq(Register reg, int value) {
 620   if (value == min_jint) { subq(reg, value); return; }
 621   if (value <  0) { incrementq(reg, -value); return; }
 622   if (value == 0) {                        ; return; }
 623   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 624   /* else */      { subq(reg, value)       ; return; }
 625 }
 626 
 627 void MacroAssembler::decrementq(Address dst, int value) {
 628   if (value == min_jint) { subq(dst, value); return; }
 629   if (value <  0) { incrementq(dst, -value); return; }
 630   if (value == 0) {                        ; return; }
 631   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 632   /* else */      { subq(dst, value)       ; return; }
 633 }
 634 
 635 void MacroAssembler::incrementq(AddressLiteral dst) {
 636   if (reachable(dst)) {
 637     incrementq(as_Address(dst));
 638   } else {
 639     lea(rscratch1, dst);
 640     incrementq(Address(rscratch1, 0));
 641   }
 642 }
 643 
 644 void MacroAssembler::incrementq(Register reg, int value) {
 645   if (value == min_jint) { addq(reg, value); return; }
 646   if (value <  0) { decrementq(reg, -value); return; }
 647   if (value == 0) {                        ; return; }
 648   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 649   /* else */      { addq(reg, value)       ; return; }
 650 }
 651 
 652 void MacroAssembler::incrementq(Address dst, int value) {
 653   if (value == min_jint) { addq(dst, value); return; }
 654   if (value <  0) { decrementq(dst, -value); return; }
 655   if (value == 0) {                        ; return; }
 656   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 657   /* else */      { addq(dst, value)       ; return; }
 658 }
 659 
 660 // 32-bit can do a case-table jump in one instruction, but we no longer allow the base
 661 // to be installed in the Address class.
 662 void MacroAssembler::jump(ArrayAddress entry) {
 663   lea(rscratch1, entry.base());
 664   Address dispatch = entry.index();
 665   assert(dispatch._base == noreg, "must be");
 666   dispatch._base = rscratch1;
 667   jmp(dispatch);
 668 }
 669 
 670 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 671   ShouldNotReachHere(); // 64bit doesn't use two regs
 672   cmpq(x_lo, y_lo);
 673 }
 674 
 675 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 676     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 677 }
 678 
 679 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 680   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 681   movptr(dst, rscratch1);
 682 }
 683 
 684 void MacroAssembler::leave() {
 685   // %%% is this really better? Why not on 32bit too?
 686   emit_int8((unsigned char)0xC9); // LEAVE
 687 }
 688 
 689 void MacroAssembler::lneg(Register hi, Register lo) {
 690   ShouldNotReachHere(); // 64bit doesn't use two regs
 691   negq(lo);
 692 }
 693 
 694 void MacroAssembler::movoop(Register dst, jobject obj) {
 695   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 696 }
 697 
 698 void MacroAssembler::movoop(Address dst, jobject obj) {
 699   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 700   movq(dst, rscratch1);
 701 }
 702 
 703 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 704   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 705 }
 706 
 707 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 708   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 709   movq(dst, rscratch1);
 710 }
 711 
 712 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 713   if (src.is_lval()) {
 714     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 715   } else {
 716     if (reachable(src)) {
 717       movq(dst, as_Address(src));
 718     } else {
 719       lea(scratch, src);
 720       movq(dst, Address(scratch, 0));
 721     }
 722   }
 723 }
 724 
 725 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 726   movq(as_Address(dst), src);
 727 }
 728 
 729 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 730   movq(dst, as_Address(src));
 731 }
 732 
 733 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 734 void MacroAssembler::movptr(Address dst, intptr_t src) {
 735   mov64(rscratch1, src);
 736   movq(dst, rscratch1);
 737 }
 738 
 739 // These are mostly for initializing NULL
 740 void MacroAssembler::movptr(Address dst, int32_t src) {
 741   movslq(dst, src);
 742 }
 743 
 744 void MacroAssembler::movptr(Register dst, int32_t src) {
 745   mov64(dst, (intptr_t)src);
 746 }
 747 
 748 void MacroAssembler::pushoop(jobject obj) {
 749   movoop(rscratch1, obj);
 750   push(rscratch1);
 751 }
 752 
 753 void MacroAssembler::pushklass(Metadata* obj) {
 754   mov_metadata(rscratch1, obj);
 755   push(rscratch1);
 756 }
 757 
 758 void MacroAssembler::pushptr(AddressLiteral src) {
 759   lea(rscratch1, src);
 760   if (src.is_lval()) {
 761     push(rscratch1);
 762   } else {
 763     pushq(Address(rscratch1, 0));
 764   }
 765 }
 766 
 767 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 768   // we must set sp to zero to clear frame
 769   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 770   // must clear fp, so that compiled frames are not confused; it is
 771   // possible that we need it only for debugging
 772   if (clear_fp) {
 773     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 774   }
 775 
 776   // Always clear the pc because it could have been set by make_walkable()
 777   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 778   vzeroupper();
 779 }
 780 
 781 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 782                                          Register last_java_fp,
 783                                          address  last_java_pc) {
 784   vzeroupper();
 785   // determine last_java_sp register
 786   if (!last_java_sp->is_valid()) {
 787     last_java_sp = rsp;
 788   }
 789 
 790   // last_java_fp is optional
 791   if (last_java_fp->is_valid()) {
 792     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 793            last_java_fp);
 794   }
 795 
 796   // last_java_pc is optional
 797   if (last_java_pc != NULL) {
 798     Address java_pc(r15_thread,
 799                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 800     lea(rscratch1, InternalAddress(last_java_pc));
 801     movptr(java_pc, rscratch1);
 802   }
 803 
 804   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 805 }
 806 
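     // Argument-passing helpers used when setting up calls into the VM:
     // on 64-bit, arguments are moved into the C argument registers unless already there.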
 807 static void pass_arg0(MacroAssembler* masm, Register arg) {
 808   if (c_rarg0 != arg ) {
 809     masm->mov(c_rarg0, arg);
 810   }
 811 }
 812 
 813 static void pass_arg1(MacroAssembler* masm, Register arg) {
 814   if (c_rarg1 != arg ) {
 815     masm->mov(c_rarg1, arg);
 816   }
 817 }
 818 
 819 static void pass_arg2(MacroAssembler* masm, Register arg) {
 820   if (c_rarg2 != arg ) {
 821     masm->mov(c_rarg2, arg);
 822   }
 823 }
 824 
 825 static void pass_arg3(MacroAssembler* masm, Register arg) {
 826   if (c_rarg3 != arg ) {
 827     masm->mov(c_rarg3, arg);
 828   }
 829 }
 830 
 831 void MacroAssembler::stop(const char* msg) {
 832   address rip = pc();
 833   pusha(); // get regs on stack
 834   lea(c_rarg0, ExternalAddress((address) msg));
 835   lea(c_rarg1, InternalAddress(rip));
 836   movq(c_rarg2, rsp); // pass pointer to regs array
 837   andq(rsp, -16); // align stack as required by ABI
 838   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 839   hlt();
 840 }
 841 
 842 void MacroAssembler::warn(const char* msg) {
 843   push(rbp);
 844   movq(rbp, rsp);
 845   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 846   push_CPU_state();   // keeps alignment at 16 bytes
 847   lea(c_rarg0, ExternalAddress((address) msg));
 848   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 849   call(rax);
 850   pop_CPU_state();
 851   mov(rsp, rbp);
 852   pop(rbp);
 853 }
 854 
 855 void MacroAssembler::print_state() {
 856   address rip = pc();
 857   pusha();            // get regs on stack
 858   push(rbp);
 859   movq(rbp, rsp);
 860   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 861   push_CPU_state();   // keeps alignment at 16 bytes
 862 
 863   lea(c_rarg0, InternalAddress(rip));
 864   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 865   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 866 
 867   pop_CPU_state();
 868   mov(rsp, rbp);
 869   pop(rbp);
 870   popa();
 871 }
 872 
 873 #ifndef PRODUCT
 874 extern "C" void findpc(intptr_t x);
 875 #endif
 876 
 877 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 878   // In order to get locks to work, we need to fake an in_VM state
 879   if (ShowMessageBoxOnError) {
 880     JavaThread* thread = JavaThread::current();
 881     JavaThreadState saved_state = thread->thread_state();
 882     thread->set_thread_state(_thread_in_vm);
 883 #ifndef PRODUCT
 884     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 885       ttyLocker ttyl;
 886       BytecodeCounter::print();
 887     }
 888 #endif
 889     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 890     // XXX correct this offset for amd64
 891     // This is the value of eip which points to where verify_oop will return.
 892     if (os::message_box(msg, "Execution stopped, print registers?")) {
 893       print_state64(pc, regs);
 894       BREAKPOINT;
 895       assert(false, "start up GDB");
 896     }
 897     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 898   } else {
 899     ttyLocker ttyl;
 900     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
 901                     msg);
 902     assert(false, "DEBUG MESSAGE: %s", msg);
 903   }
 904 }
 905 
 906 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 907   ttyLocker ttyl;
 908   FlagSetting fs(Debugging, true);
 909   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 910 #ifndef PRODUCT
 911   tty->cr();
 912   findpc(pc);
 913   tty->cr();
 914 #endif
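       // regs[] is the save area laid out by pusha() and passed in by stop()/print_state();
       // rax ends up in the highest slot and r15 in the lowest, hence the reversed indices below.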
 915 #define PRINT_REG(rax, value) \
 916   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 917   PRINT_REG(rax, regs[15]);
 918   PRINT_REG(rbx, regs[12]);
 919   PRINT_REG(rcx, regs[14]);
 920   PRINT_REG(rdx, regs[13]);
 921   PRINT_REG(rdi, regs[8]);
 922   PRINT_REG(rsi, regs[9]);
 923   PRINT_REG(rbp, regs[10]);
 924   PRINT_REG(rsp, regs[11]);
 925   PRINT_REG(r8 , regs[7]);
 926   PRINT_REG(r9 , regs[6]);
 927   PRINT_REG(r10, regs[5]);
 928   PRINT_REG(r11, regs[4]);
 929   PRINT_REG(r12, regs[3]);
 930   PRINT_REG(r13, regs[2]);
 931   PRINT_REG(r14, regs[1]);
 932   PRINT_REG(r15, regs[0]);
 933 #undef PRINT_REG
 934   // Print some words near top of stack.
 935   int64_t* rsp = (int64_t*) regs[11];
 936   int64_t* dump_sp = rsp;
 937   for (int col1 = 0; col1 < 8; col1++) {
 938     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 939     os::print_location(tty, *dump_sp++);
 940   }
 941   for (int row = 0; row < 25; row++) {
 942     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 943     for (int col = 0; col < 4; col++) {
 944       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 945     }
 946     tty->cr();
 947   }
 948   // Print some instructions around pc:
 949   Disassembler::decode((address)pc-64, (address)pc);
 950   tty->print_cr("--------");
 951   Disassembler::decode((address)pc, (address)pc+32);
 952 }
 953 
 954 #endif // _LP64
 955 
 956 // Now versions that are common to 32/64 bit
 957 
 958 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 959   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 960 }
 961 
 962 void MacroAssembler::addptr(Register dst, Register src) {
 963   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 964 }
 965 
 966 void MacroAssembler::addptr(Address dst, Register src) {
 967   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 968 }
 969 
 970 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 971   if (reachable(src)) {
 972     Assembler::addsd(dst, as_Address(src));
 973   } else {
 974     lea(rscratch1, src);
 975     Assembler::addsd(dst, Address(rscratch1, 0));
 976   }
 977 }
 978 
 979 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 980   if (reachable(src)) {
 981     addss(dst, as_Address(src));
 982   } else {
 983     lea(rscratch1, src);
 984     addss(dst, Address(rscratch1, 0));
 985   }
 986 }
 987 
 988 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
 989   if (reachable(src)) {
 990     Assembler::addpd(dst, as_Address(src));
 991   } else {
 992     lea(rscratch1, src);
 993     Assembler::addpd(dst, Address(rscratch1, 0));
 994   }
 995 }
 996 
 997 void MacroAssembler::align(int modulus) {
 998   align(modulus, offset());
 999 }
1000 
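     // Pad with nops so that the given target offset is advanced to the next
     // multiple of modulus.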
1001 void MacroAssembler::align(int modulus, int target) {
1002   if (target % modulus != 0) {
1003     nop(modulus - (target % modulus));
1004   }
1005 }
1006 
1007 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1008   // Used in sign-masking with aligned address.
1009   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires 16-byte address alignment");
1010   if (reachable(src)) {
1011     Assembler::andpd(dst, as_Address(src));
1012   } else {
1013     lea(scratch_reg, src);
1014     Assembler::andpd(dst, Address(scratch_reg, 0));
1015   }
1016 }
1017 
1018 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1019   // Used in sign-masking with aligned address.
1020   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires 16-byte address alignment");
1021   if (reachable(src)) {
1022     Assembler::andps(dst, as_Address(src));
1023   } else {
1024     lea(scratch_reg, src);
1025     Assembler::andps(dst, Address(scratch_reg, 0));
1026   }
1027 }
1028 
1029 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1030   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1031 }
1032 
1033 void MacroAssembler::atomic_incl(Address counter_addr) {
1034   lock();
1035   incrementl(counter_addr);
1036 }
1037 
1038 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1039   if (reachable(counter_addr)) {
1040     atomic_incl(as_Address(counter_addr));
1041   } else {
1042     lea(scr, counter_addr);
1043     atomic_incl(Address(scr, 0));
1044   }
1045 }
1046 
1047 #ifdef _LP64
1048 void MacroAssembler::atomic_incq(Address counter_addr) {
1049   lock();
1050   incrementq(counter_addr);
1051 }
1052 
1053 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1054   if (reachable(counter_addr)) {
1055     atomic_incq(as_Address(counter_addr));
1056   } else {
1057     lea(scr, counter_addr);
1058     atomic_incq(Address(scr, 0));
1059   }
1060 }
1061 #endif
1062 
1063 // Writes to successive stack pages until the given offset is reached, to check for
1064 // stack overflow + shadow pages.  This clobbers tmp.
1065 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1066   movptr(tmp, rsp);
1067   // Bang stack for total size given plus shadow page size.
1068   // Bang one page at a time because large size can bang beyond yellow and
1069   // red zones.
1070   Label loop;
1071   bind(loop);
1072   movl(Address(tmp, (-os::vm_page_size())), size );
1073   subptr(tmp, os::vm_page_size());
1074   subl(size, os::vm_page_size());
1075   jcc(Assembler::greater, loop);
1076 
1077   // Bang down shadow pages too.
1078   // At this point, (tmp-0) is the last address touched, so don't
1079   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1080   // was post-decremented.)  Skip this address by starting at i=1, and
1081   // touch a few more pages below.  N.B.  It is important to touch all
1082   // the way down including all pages in the shadow zone.
1083   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1084     // this could be any sized move but this can be a debugging crumb
1085     // so the bigger the better.
1086     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1087   }
1088 }
1089 
1090 void MacroAssembler::reserved_stack_check() {
1091     // testing if reserved zone needs to be enabled
1092     Label no_reserved_zone_enabling;
1093     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1094     NOT_LP64(get_thread(rsi);)
1095 
1096     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1097     jcc(Assembler::below, no_reserved_zone_enabling);
1098 
1099     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1100     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1101     should_not_reach_here();
1102 
1103     bind(no_reserved_zone_enabling);
1104 }
1105 
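     // Returns the code offset recorded in null_check_offset, i.e. the offset of the
     // first instruction that dereferences obj_reg, which callers may use for an
     // implicit null check.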
1106 int MacroAssembler::biased_locking_enter(Register lock_reg,
1107                                          Register obj_reg,
1108                                          Register swap_reg,
1109                                          Register tmp_reg,
1110                                          bool swap_reg_contains_mark,
1111                                          Label& done,
1112                                          Label* slow_case,
1113                                          BiasedLockingCounters* counters) {
1114   assert(UseBiasedLocking, "why call this otherwise?");
1115   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1116   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1117   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1118   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
1119   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1120   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1121 
1122   if (PrintBiasedLockingStatistics && counters == NULL) {
1123     counters = BiasedLocking::counters();
1124   }
1125   // Biased locking
1126   // See whether the lock is currently biased toward our thread and
1127   // whether the epoch is still valid
1128   // Note that the runtime guarantees sufficient alignment of JavaThread
1129   // pointers to allow age to be placed into low bits
1130   // First check to see whether biasing is even enabled for this object
1131   Label cas_label;
1132   int null_check_offset = -1;
1133   if (!swap_reg_contains_mark) {
1134     null_check_offset = offset();
1135     movptr(swap_reg, mark_addr);
1136   }
1137   movptr(tmp_reg, swap_reg);
1138   andptr(tmp_reg, markWord::biased_lock_mask_in_place);
1139   cmpptr(tmp_reg, markWord::biased_lock_pattern);
1140   jcc(Assembler::notEqual, cas_label);
1141   // The bias pattern is present in the object's header. Need to check
1142   // whether the bias owner and the epoch are both still current.
1143 #ifndef _LP64
1144   // Note that because there is no current thread register on x86_32 we
1145   // need to store off the mark word we read out of the object to
1146   // avoid reloading it and needing to recheck invariants below. This
1147   // store is unfortunate but it makes the overall code shorter and
1148   // simpler.
1149   movptr(saved_mark_addr, swap_reg);
1150 #endif
1151   if (swap_reg_contains_mark) {
1152     null_check_offset = offset();
1153   }
1154   load_prototype_header(tmp_reg, obj_reg);
1155 #ifdef _LP64
1156   orptr(tmp_reg, r15_thread);
1157   xorptr(tmp_reg, swap_reg);
1158   Register header_reg = tmp_reg;
1159 #else
1160   xorptr(tmp_reg, swap_reg);
1161   get_thread(swap_reg);
1162   xorptr(swap_reg, tmp_reg);
1163   Register header_reg = swap_reg;
1164 #endif
1165   andptr(header_reg, ~((int) markWord::age_mask_in_place));
1166   if (counters != NULL) {
1167     cond_inc32(Assembler::zero,
1168                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1169   }
1170   jcc(Assembler::equal, done);
1171 
1172   Label try_revoke_bias;
1173   Label try_rebias;
1174 
1175   // At this point we know that the header has the bias pattern and
1176   // that we are not the bias owner in the current epoch. We need to
1177   // figure out more details about the state of the header in order to
1178   // know what operations can be legally performed on the object's
1179   // header.
1180 
1181   // If the low three bits in the xor result aren't clear, that means
1182   // the prototype header is no longer biased and we have to revoke
1183   // the bias on this object.
1184   testptr(header_reg, markWord::biased_lock_mask_in_place);
1185   jccb(Assembler::notZero, try_revoke_bias);
1186 
1187   // Biasing is still enabled for this data type. See whether the
1188   // epoch of the current bias is still valid, meaning that the epoch
1189   // bits of the mark word are equal to the epoch bits of the
1190   // prototype header. (Note that the prototype header's epoch bits
1191   // only change at a safepoint.) If not, attempt to rebias the object
1192   // toward the current thread. Note that we must be absolutely sure
1193   // that the current epoch is invalid in order to do this because
1194   // otherwise the manipulations it performs on the mark word are
1195   // illegal.
1196   testptr(header_reg, markWord::epoch_mask_in_place);
1197   jccb(Assembler::notZero, try_rebias);
1198 
1199   // The epoch of the current bias is still valid but we know nothing
1200   // about the owner; it might be set or it might be clear. Try to
1201   // acquire the bias of the object using an atomic operation. If this
1202   // fails we will go in to the runtime to revoke the object's bias.
1203   // Note that we first construct the presumed unbiased header so we
1204   // don't accidentally blow away another thread's valid bias.
1205   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1206   andptr(swap_reg,
1207          markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
1208 #ifdef _LP64
1209   movptr(tmp_reg, swap_reg);
1210   orptr(tmp_reg, r15_thread);
1211 #else
1212   get_thread(tmp_reg);
1213   orptr(tmp_reg, swap_reg);
1214 #endif
1215   lock();
1216   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1217   // If the biasing toward our thread failed, this means that
1218   // another thread succeeded in biasing it toward itself and we
1219   // need to revoke that bias. The revocation will occur in the
1220   // interpreter runtime in the slow case.
1221   if (counters != NULL) {
1222     cond_inc32(Assembler::zero,
1223                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1224   }
1225   if (slow_case != NULL) {
1226     jcc(Assembler::notZero, *slow_case);
1227   }
1228   jmp(done);
1229 
1230   bind(try_rebias);
1231   // At this point we know the epoch has expired, meaning that the
1232   // current "bias owner", if any, is actually invalid. Under these
1233   // circumstances _only_, we are allowed to use the current header's
1234   // value as the comparison value when doing the cas to acquire the
1235   // bias in the current epoch. In other words, we allow transfer of
1236   // the bias from one thread to another directly in this situation.
1237   //
1238   // FIXME: due to a lack of registers we currently blow away the age
1239   // bits in this situation. Should attempt to preserve them.
1240   load_prototype_header(tmp_reg, obj_reg);
1241 #ifdef _LP64
1242   orptr(tmp_reg, r15_thread);
1243 #else
1244   get_thread(swap_reg);
1245   orptr(tmp_reg, swap_reg);
1246   movptr(swap_reg, saved_mark_addr);
1247 #endif
1248   lock();
1249   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1250   // If the biasing toward our thread failed, then another thread
1251   // succeeded in biasing it toward itself and we need to revoke that
1252   // bias. The revocation will occur in the runtime in the slow case.
1253   if (counters != NULL) {
1254     cond_inc32(Assembler::zero,
1255                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1256   }
1257   if (slow_case != NULL) {
1258     jcc(Assembler::notZero, *slow_case);
1259   }
1260   jmp(done);
1261 
1262   bind(try_revoke_bias);
1263   // The prototype mark in the klass doesn't have the bias bit set any
1264   // more, indicating that objects of this data type are not supposed
1265   // to be biased any more. We are going to try to reset the mark of
1266   // this object to the prototype value and fall through to the
1267   // CAS-based locking scheme. Note that if our CAS fails, it means
1268   // that another thread raced us for the privilege of revoking the
1269   // bias of this particular object, so it's okay to continue in the
1270   // normal locking code.
1271   //
1272   // FIXME: due to a lack of registers we currently blow away the age
1273   // bits in this situation. Should attempt to preserve them.
1274   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1275   load_prototype_header(tmp_reg, obj_reg);
1276   lock();
1277   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1278   // Fall through to the normal CAS-based lock, because no matter what
1279   // the result of the above CAS, some thread must have succeeded in
1280   // removing the bias bit from the object's header.
1281   if (counters != NULL) {
1282     cond_inc32(Assembler::zero,
1283                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1284   }
1285 
1286   bind(cas_label);
1287 
1288   return null_check_offset;
1289 }
1290 
1291 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1292   assert(UseBiasedLocking, "why call this otherwise?");
1293 
1294   // Check for biased locking unlock case, which is a no-op
1295   // Note: we do not have to check the thread ID for two reasons.
1296   // First, the interpreter checks for IllegalMonitorStateException at
1297   // a higher level. Second, if the bias was revoked while we held the
1298   // lock, the object could not be rebiased toward another thread, so
1299   // the bias bit would be clear.
1300   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1301   andptr(temp_reg, markWord::biased_lock_mask_in_place);
1302   cmpptr(temp_reg, markWord::biased_lock_pattern);
1303   jcc(Assembler::equal, done);
1304 }
1305 
1306 #ifdef COMPILER2
1307 
1308 #if INCLUDE_RTM_OPT
1309 
1310 // Update rtm_counters based on abort status
1311 // input: abort_status
1312 //        rtm_counters (RTMLockingCounters*)
1313 // flags are killed
1314 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1315 
1316   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1317   if (PrintPreciseRTMLockingStatistics) {
1318     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1319       Label check_abort;
1320       testl(abort_status, (1<<i));
1321       jccb(Assembler::equal, check_abort);
1322       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1323       bind(check_abort);
1324     }
1325   }
1326 }
1327 
1328 // Branch if (random & (count-1) != 0), count is 2^n
1329 // tmp, scr and flags are killed
1330 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1331   assert(tmp == rax, "");
1332   assert(scr == rdx, "");
1333   rdtsc(); // modifies EDX:EAX
1334   andptr(tmp, count-1);
1335   jccb(Assembler::notZero, brLabel);
1336 }
1337 
1338 // Perform abort ratio calculation, set no_rtm bit if high ratio
1339 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1340 // tmpReg, rtm_counters_Reg and flags are killed
1341 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1342                                                  Register rtm_counters_Reg,
1343                                                  RTMLockingCounters* rtm_counters,
1344                                                  Metadata* method_data) {
1345   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1346 
1347   if (RTMLockingCalculationDelay > 0) {
1348     // Delay calculation
1349     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1350     testptr(tmpReg, tmpReg);
1351     jccb(Assembler::equal, L_done);
1352   }
1353   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1354   //   Aborted transactions = abort_count * 100
1355   //   All transactions = total_count *  RTMTotalCountIncrRate
1356   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1357 
1358   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1359   cmpptr(tmpReg, RTMAbortThreshold);
1360   jccb(Assembler::below, L_check_always_rtm2);
1361   imulptr(tmpReg, tmpReg, 100);
1362 
1363   Register scrReg = rtm_counters_Reg;
1364   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1365   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1366   imulptr(scrReg, scrReg, RTMAbortRatio);
1367   cmpptr(tmpReg, scrReg);
1368   jccb(Assembler::below, L_check_always_rtm1);
1369   if (method_data != NULL) {
1370     // set rtm_state to "no rtm" in MDO
1371     mov_metadata(tmpReg, method_data);
1372     lock();
1373     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1374   }
1375   jmpb(L_done);
1376   bind(L_check_always_rtm1);
1377   // Reload RTMLockingCounters* address
1378   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1379   bind(L_check_always_rtm2);
1380   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1381   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1382   jccb(Assembler::below, L_done);
1383   if (method_data != NULL) {
1384     // set rtm_state to "always rtm" in MDO
1385     mov_metadata(tmpReg, method_data);
1386     lock();
1387     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1388   }
1389   bind(L_done);
1390 }
1391 
1392 // Update counters and perform abort ratio calculation
1393 // input:  abort_status_Reg
1394 // rtm_counters_Reg, flags are killed
1395 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1396                                    Register rtm_counters_Reg,
1397                                    RTMLockingCounters* rtm_counters,
1398                                    Metadata* method_data,
1399                                    bool profile_rtm) {
1400 
1401   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1402   // update rtm counters based on rax value at abort
1403   // reads abort_status_Reg, updates flags
1404   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1405   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1406   if (profile_rtm) {
1407     // Save abort status because abort_status_Reg is used by following code.
1408     if (RTMRetryCount > 0) {
1409       push(abort_status_Reg);
1410     }
1411     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1412     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1413     // restore abort status
1414     if (RTMRetryCount > 0) {
1415       pop(abort_status_Reg);
1416     }
1417   }
1418 }
1419 
1420 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1421 // inputs: retry_count_Reg
1422 //       : abort_status_Reg
1423 // output: retry_count_Reg decremented by 1
1424 // flags are killed
1425 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1426   Label doneRetry;
1427   assert(abort_status_Reg == rax, "");
1428   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1429   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1430   // if reason is in 0x6 and retry count != 0 then retry
1431   andptr(abort_status_Reg, 0x6);
1432   jccb(Assembler::zero, doneRetry);
1433   testl(retry_count_Reg, retry_count_Reg);
1434   jccb(Assembler::zero, doneRetry);
1435   pause();
1436   decrementl(retry_count_Reg);
1437   jmp(retryLabel);
1438   bind(doneRetry);
1439 }
1440 
1441 // Spin and retry if lock is busy,
1442 // inputs: box_Reg (monitor address)
1443 //       : retry_count_Reg
1444 // output: retry_count_Reg decremented by 1
1445 //       : clear z flag if retry count exceeded
1446 // tmp_Reg, scr_Reg, flags are killed
1447 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1448                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1449   Label SpinLoop, SpinExit, doneRetry;
1450   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1451 
1452   testl(retry_count_Reg, retry_count_Reg);
1453   jccb(Assembler::zero, doneRetry);
1454   decrementl(retry_count_Reg);
1455   movptr(scr_Reg, RTMSpinLoopCount);
1456 
1457   bind(SpinLoop);
1458   pause();
1459   decrementl(scr_Reg);
1460   jccb(Assembler::lessEqual, SpinExit);
1461   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1462   testptr(tmp_Reg, tmp_Reg);
1463   jccb(Assembler::notZero, SpinLoop);
1464 
1465   bind(SpinExit);
1466   jmp(retryLabel);
1467   bind(doneRetry);
1468   incrementl(retry_count_Reg); // clear z flag
1469 }
1470 
1471 // Use RTM for normal stack locks
1472 // Input: objReg (object to lock)
1473 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1474                                        Register retry_on_abort_count_Reg,
1475                                        RTMLockingCounters* stack_rtm_counters,
1476                                        Metadata* method_data, bool profile_rtm,
1477                                        Label& DONE_LABEL, Label& IsInflated) {
1478   assert(UseRTMForStackLocks, "why call this otherwise?");
1479   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1480   assert(tmpReg == rax, "");
1481   assert(scrReg == rdx, "");
1482   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1483 
1484   if (RTMRetryCount > 0) {
1485     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1486     bind(L_rtm_retry);
1487   }
1488   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1489   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
1490   jcc(Assembler::notZero, IsInflated);
1491 
1492   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1493     Label L_noincrement;
1494     if (RTMTotalCountIncrRate > 1) {
1495       // tmpReg, scrReg and flags are killed
1496       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1497     }
1498     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1499     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1500     bind(L_noincrement);
1501   }
1502   xbegin(L_on_abort);
1503   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
1504   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
1505   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
1506   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1507 
1508   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1509   if (UseRTMXendForLockBusy) {
1510     xend();
1511     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1512     jmp(L_decrement_retry);
1513   }
1514   else {
1515     xabort(0);
1516   }
1517   bind(L_on_abort);
1518   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1519     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1520   }
1521   bind(L_decrement_retry);
1522   if (RTMRetryCount > 0) {
1523     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1524     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1525   }
1526 }
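
// Illustrative sketch only, kept out of the build: the same stack-lock attempt
// expressed with the compiler's RTM intrinsics (<immintrin.h>, compiled with
// -mrtm). mark_addr and unlocked_value are hypothetical stand-ins for the
// object's mark word and markWord::unlocked_value; the real code keeps the
// transaction open and commits it in fast_unlock via xend.
#if 0
#include <immintrin.h>
static bool rtm_stack_lock_attempt_sketch(volatile uintptr_t* mark_addr, uintptr_t unlocked_value) {
  unsigned int status = _xbegin();
  if (status == _XBEGIN_STARTED) {
    if ((*mark_addr & 0x7) == unlocked_value) {
      return true;                   // stay inside the transaction; commit later with _xend()
    }
    _xabort(0);                      // object already locked -> abort the transaction
  }
  return false;                      // aborted: caller inspects the status and may retry
}
#endif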
1527 
1528 // Use RTM for inflating locks
1529 // inputs: objReg (object to lock)
1530 //         boxReg (on-stack box address (displaced header location) - KILLED)
1531 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
1532 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1533                                           Register scrReg, Register retry_on_busy_count_Reg,
1534                                           Register retry_on_abort_count_Reg,
1535                                           RTMLockingCounters* rtm_counters,
1536                                           Metadata* method_data, bool profile_rtm,
1537                                           Label& DONE_LABEL) {
1538   assert(UseRTMLocking, "why call this otherwise?");
1539   assert(tmpReg == rax, "");
1540   assert(scrReg == rdx, "");
1541   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1542   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1543 
1544   // Without cast to int32_t a movptr will destroy r10 which is typically obj
1545   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1546   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1547 
1548   if (RTMRetryCount > 0) {
1549     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1550     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1551     bind(L_rtm_retry);
1552   }
1553   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1554     Label L_noincrement;
1555     if (RTMTotalCountIncrRate > 1) {
1556       // tmpReg, scrReg and flags are killed
1557       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1558     }
1559     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1560     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1561     bind(L_noincrement);
1562   }
1563   xbegin(L_on_abort);
1564   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1565   movptr(tmpReg, Address(tmpReg, owner_offset));
1566   testptr(tmpReg, tmpReg);
1567   jcc(Assembler::zero, DONE_LABEL);
1568   if (UseRTMXendForLockBusy) {
1569     xend();
1570     jmp(L_decrement_retry);
1571   }
1572   else {
1573     xabort(0);
1574   }
1575   bind(L_on_abort);
1576   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1577   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1578     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1579   }
1580   if (RTMRetryCount > 0) {
1581     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1582     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1583   }
1584 
1585   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1586   testptr(tmpReg, tmpReg) ;
1587   jccb(Assembler::notZero, L_decrement_retry) ;
1588 
1589   // Appears unlocked - try to swing _owner from null to non-null.
1590   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1591 #ifdef _LP64
1592   Register threadReg = r15_thread;
1593 #else
1594   get_thread(scrReg);
1595   Register threadReg = scrReg;
1596 #endif
1597   lock();
1598   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1599 
1600   if (RTMRetryCount > 0) {
1601     // if successful we are done, otherwise retry
1602     jccb(Assembler::equal, DONE_LABEL) ;
1603     bind(L_decrement_retry);
1604     // Spin and retry if lock is busy.
1605     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1606   }
1607   else {
1608     bind(L_decrement_retry);
1609   }
1610 }
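
// Illustrative sketch only (not used by the VM): the non-transactional fallback
// at the end of rtm_inflated_locking is a plain compare-and-swap of the
// monitor's owner slot from NULL to the current thread, shown here with a
// GCC/Clang atomic builtin over a hypothetical owner field.
static inline bool cas_monitor_owner_sketch(volatile intptr_t* owner_slot, intptr_t self) {
  intptr_t expected = 0;
  return __atomic_compare_exchange_n(owner_slot, &expected, self,
                                     false /* strong CAS */,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}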
1611 
1612 #endif //  INCLUDE_RTM_OPT
1613 
1614 // Fast_Lock and Fast_Unlock used by C2
1615 
1616 // Because the transitions from emitted code to the runtime
1617 // monitorenter/exit helper stubs are so slow it's critical that
1618 // we inline both the stack-locking fast-path and the inflated fast path.
1619 //
1620 // See also: cmpFastLock and cmpFastUnlock.
1621 //
1622 // What follows is a specialized inline transliteration of the code
1623 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1624 // another option would be to emit TrySlowEnter and TrySlowExit methods
1625 // at startup-time.  These methods would accept arguments as
1626 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1627 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1628 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1629 // In practice, however, the # of lock sites is bounded and is usually small.
1630 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1631 // if the processor uses simple bimodal branch predictors keyed by EIP,
1632 // since the helper routines would be called from multiple synchronization
1633 // sites.
1634 //
1635 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1636 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1637 // to those specialized methods.  That'd give us a mostly platform-independent
1638 // implementation that the JITs could optimize and inline at their pleasure.
1639 // Done correctly, the only time we'd need to cross to native code would be
1640 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1641 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1642 // (b) explicit barriers or fence operations.
1643 //
1644 // TODO:
1645 //
1646 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1647 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1648 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1649 //    the lock operators would typically be faster than reifying Self.
1650 //
1651 // *  Ideally I'd define the primitives as:
1652 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1653 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1654 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1655 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
1656 //    Furthermore the register assignments are overconstrained, possibly resulting in
1657 //    sub-optimal code near the synchronization site.
1658 //
1659 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1660 //    Alternately, use a better sp-proximity test.
1661 //
1662 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1663 //    Either one is sufficient to uniquely identify a thread.
1664 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1665 //
1666 // *  Intrinsify notify() and notifyAll() for the common cases where the
1667 //    object is locked by the calling thread but the waitlist is empty.
1668 //    This avoids the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1669 //
1670 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1671 //    But beware of excessive branch density on AMD Opterons.
1672 //
1673 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1674 //    or failure of the fast-path.  If the fast-path fails then we pass
1675 //    control to the slow-path, typically in C.  In Fast_Lock and
1676 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1677 //    will emit a conditional branch immediately after the node.
1678 //    So we have branches to branches and lots of ICC.ZF games.
1679 //    Instead, it might be better to have C2 pass a "FailureLabel"
1680 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1681 //    will drop through the node.  ICC.ZF is undefined at exit.
1682 //    In the case of failure, the node will branch directly to the
1683 //    FailureLabel.
1684 
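
// Hypothetical sketch only (not VM code): the success/failure contract of the
// fast paths below, modeled with plain booleans. In the generated code the
// result is carried in ICC.ZF (ZF == 1 means success) and the conditional
// branch is emitted by C2 immediately after the cmpFastLock/cmpFastUnlock node.
static inline void lock_with_fast_path_sketch(bool (*fast_lock_attempt)(), void (*slow_enter)()) {
  if (!fast_lock_attempt()) {        // ZF == 0 -> the inline fast path failed
    slow_enter();                    // fall back to the runtime monitorenter helper
  }
}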
1685 
1686 // obj: object to lock
1687 // box: on-stack box address (displaced header location) - KILLED
1688 // rax: tmp -- KILLED
1689 // scr: tmp -- KILLED
1690 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1691                                Register scrReg, Register cx1Reg, Register cx2Reg,
1692                                BiasedLockingCounters* counters,
1693                                RTMLockingCounters* rtm_counters,
1694                                RTMLockingCounters* stack_rtm_counters,
1695                                Metadata* method_data,
1696                                bool use_rtm, bool profile_rtm) {
1697   // Ensure the register assignments are disjoint
1698   assert(tmpReg == rax, "");
1699 
1700   if (use_rtm) {
1701     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1702   } else {
1703     assert(cx1Reg == noreg, "");
1704     assert(cx2Reg == noreg, "");
1705     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1706   }
1707 
1708   if (counters != NULL) {
1709     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1710   }
1711 
1712   // Possible cases that we'll encounter in fast_lock
1713   // ------------------------------------------------
1714   // * Inflated
1715   //    -- unlocked
1716   //    -- Locked
1717   //       = by self
1718   //       = by other
1719   // * biased
1720   //    -- by Self
1721   //    -- by other
1722   // * neutral
1723   // * stack-locked
1724   //    -- by self
1725   //       = sp-proximity test hits
1726   //       = sp-proximity test generates false-negative
1727   //    -- by other
1728   //
1729 
1730   Label IsInflated, DONE_LABEL;
1731 
1732   // it's stack-locked, biased or neutral
1733   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1734   // order to reduce the number of conditional branches in the most common cases.
1735   // Beware -- there's a subtle invariant that fetch of the markword
1736   // at [FETCH], below, will never observe a biased encoding (*101b).
1737   // If this invariant is not held we risk exclusion (safety) failure.
1738   if (UseBiasedLocking && !UseOptoBiasInlining) {
1739     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1740   }
1741 
1742 #if INCLUDE_RTM_OPT
1743   if (UseRTMForStackLocks && use_rtm) {
1744     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1745                       stack_rtm_counters, method_data, profile_rtm,
1746                       DONE_LABEL, IsInflated);
1747   }
1748 #endif // INCLUDE_RTM_OPT
1749 
1750   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
1751   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
1752   jccb(Assembler::notZero, IsInflated);
1753 
1754   // Attempt stack-locking ...
1755   orptr (tmpReg, markWord::unlocked_value);
1756   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1757   lock();
1758   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
1759   if (counters != NULL) {
1760     cond_inc32(Assembler::equal,
1761                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1762   }
1763   jcc(Assembler::equal, DONE_LABEL);           // Success
1764 
1765   // Recursive locking.
1766   // The object is stack-locked: markword contains stack pointer to BasicLock.
1767   // Locked by current thread if difference with current SP is less than one page.
1768   subptr(tmpReg, rsp);
1769   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1770   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1771   movptr(Address(boxReg, 0), tmpReg);
1772   if (counters != NULL) {
1773     cond_inc32(Assembler::equal,
1774                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1775   }
1776   jmp(DONE_LABEL);
1777 
1778   bind(IsInflated);
1779   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
1780 
1781 #if INCLUDE_RTM_OPT
1782   // Use the same RTM locking code in 32- and 64-bit VM.
1783   if (use_rtm) {
1784     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1785                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1786   } else {
1787 #endif // INCLUDE_RTM_OPT
1788 
1789 #ifndef _LP64
1790   // The object is inflated.
1791 
1792   // boxReg refers to the on-stack BasicLock in the current frame.
1793   // We'd like to write:
1794   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
1795   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1796   // additional latency as we have another ST in the store buffer that must drain.
1797 
1798   // avoid ST-before-CAS
1799   // register juggle because we need tmpReg for cmpxchgptr below
1800   movptr(scrReg, boxReg);
1801   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1802 
1803   // Optimistic form: consider XORL tmpReg,tmpReg
1804   movptr(tmpReg, NULL_WORD);
1805 
1806   // Appears unlocked - try to swing _owner from null to non-null.
1807   // Ideally, I'd manifest "Self" with get_thread and then attempt
1808   // to CAS the register containing Self into m->Owner.
1809   // But we don't have enough registers, so instead we can either try to CAS
1810   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1811   // we later store "Self" into m->Owner.  Transiently storing a stack address
1812   // (rsp or the address of the box) into  m->owner is harmless.
1813   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1814   lock();
1815   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1816   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1817   // If we weren't able to swing _owner from NULL to the BasicLock
1818   // then take the slow path.
1819   jccb  (Assembler::notZero, DONE_LABEL);
1820   // update _owner from BasicLock to thread
1821   get_thread (scrReg);                    // beware: clobbers ICCs
1822   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1823   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1824 
1825   // If the CAS fails we can either retry or pass control to the slow-path.
1826   // We use the latter tactic.
1827   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1828   // If the CAS was successful ...
1829   //   Self has acquired the lock
1830   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1831   // Intentional fall-through into DONE_LABEL ...
1832 #else // _LP64
1833   // It's inflated
1834   movq(scrReg, tmpReg);
1835   xorq(tmpReg, tmpReg);
1836 
1837   lock();
1838   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1839   // Unconditionally set box->_displaced_header = markWord::unused_mark().
1840   // Without cast to int32_t movptr will destroy r10 which is typically obj.
1841   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1842   // The following code to verify that the object field still refers
1843   // to the object we are trying to lock is not needed with safepoint
1844   // based deflation. It is also not needed with async deflation when
1845   // the DEFLATER_MARKER is allowed to linger in the owner field in an
1846   // async deflated ObjectMonitor until replaced by the next owner value.
1847   // We keep this code as a sanity check against bugs in other parts
1848   // of the async deflation mechanism.
1849   //
1850   // If we weren't able to swing _owner from NULL to r15_thread
1851   // then take the slow path.
1852   jccb(Assembler::notZero, DONE_LABEL);
1853   // r15_thread is now the owner so verify that the ObjectMonitor
1854   // still refers to the same object.
1855   cmpptr(objReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(object)));
1856   // The ObjectMonitor still refers to the same object so
1857   // r15_thread's ownership is valid.
1858   jccb(Assembler::zero, DONE_LABEL);
1859   // The ObjectMonitor does not refer to the same object so
1860   // drop ownership.
1861   movptr(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1862   // Intentional fall-through into DONE_LABEL ...
1863   // Propagate ICC.ZF from cmpptr() above into DONE_LABEL.
1864 #endif // _LP64
1865 #if INCLUDE_RTM_OPT
1866   } // use_rtm()
1867 #endif
1868   // DONE_LABEL is a hot target - we'd really like to place it at the
1869   // start of cache line by padding with NOPs.
1870   // See the AMD and Intel software optimization manuals for the
1871   // most efficient "long" NOP encodings.
1872   // Unfortunately none of our alignment mechanisms suffice.
1873   bind(DONE_LABEL);
1874 
1875   // At DONE_LABEL the icc ZFlag is set as follows ...
1876   // Fast_Unlock uses the same protocol.
1877   // ZFlag == 1 -> Success
1878   // ZFlag == 0 -> Failure - force control through the slow-path
1879 }
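
// Illustrative sketch only (not used by the VM): the recursive stack-lock test
// in the middle of fast_lock above, in C terms for the LP64 mask (7 - page_size).
// For a stack-locked object the mark word is a pointer to the owning frame's
// BasicLock; if that pointer is 8-byte aligned and less than one page above the
// current stack pointer it belongs to this thread, so the lock is recursive and
// the displaced header is set to 0.
static inline bool is_recursive_stack_lock_sketch(uintptr_t mark, uintptr_t sp, uintptr_t page_size) {
  uintptr_t mask = ~(page_size - 1) | 7;   // equals (uintptr_t)(7 - page_size) for power-of-two pages
  return ((mark - sp) & mask) == 0;        // corresponds to ZF == 1 in the generated code
}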
1880 
1881 // obj: object to unlock
1882 // box: box address (displaced header location), killed.  Must be EAX.
1883 // tmp: killed, cannot be obj nor box.
1884 //
1885 // Some commentary on balanced locking:
1886 //
1887 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1888 // Methods that don't have provably balanced locking are forced to run in the
1889 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1890 // The interpreter provides two properties:
1891 // I1:  At return-time the interpreter automatically and quietly unlocks any
1892 //      objects acquired by the current activation (frame).  Recall that the
1893 //      interpreter maintains an on-stack list of locks currently held by
1894 //      a frame.
1895 // I2:  If a method attempts to unlock an object that is not held by the
1896 //      frame, the interpreter throws IMSX.
1897 //
1898 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1899 // B() doesn't have provably balanced locking so it runs in the interpreter.
1900 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1901 // is still locked by A().
1902 //
1903 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1904 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1905 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1906 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1907 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1908 // could reasonably *avoid* checking owner in Fast_Unlock().
1909 // In the interest of performance we elide m->Owner==Self check in unlock.
1910 // A perfectly viable alternative is to elide the owner check except when
1911 // Xcheck:jni is enabled.
1912 
1913 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1914   assert(boxReg == rax, "");
1915   assert_different_registers(objReg, boxReg, tmpReg);
1916 
1917   Label DONE_LABEL, Stacked, CheckSucc;
1918 
1919   // Critically, the biased locking test must have precedence over
1920   // and appear before the (box->dhw == 0) recursive stack-lock test.
1921   if (UseBiasedLocking && !UseOptoBiasInlining) {
1922     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1923   }
1924 
1925 #if INCLUDE_RTM_OPT
1926   if (UseRTMForStackLocks && use_rtm) {
1927     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1928     Label L_regular_unlock;
1929     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
1930     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
1931     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
1932     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
1933     xend();                                                           // otherwise end...
1934     jmp(DONE_LABEL);                                                  // ... and we're done
1935     bind(L_regular_unlock);
1936   }
1937 #endif
1938 
1939   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
1940   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
1941   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
1942   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
1943   jccb  (Assembler::zero, Stacked);
1944 
1945   // It's inflated.
1946 #if INCLUDE_RTM_OPT
1947   if (use_rtm) {
1948     Label L_regular_inflated_unlock;
1949     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1950     movptr(boxReg, Address(tmpReg, owner_offset));
1951     testptr(boxReg, boxReg);
1952     jccb(Assembler::notZero, L_regular_inflated_unlock);
1953     xend();
1954     jmpb(DONE_LABEL);
1955     bind(L_regular_inflated_unlock);
1956   }
1957 #endif
1958 
1959   // Despite our balanced locking property we still check that m->_owner == Self
1960   // as java routines or native JNI code called by this thread might
1961   // have released the lock.
1962   // Refer to the comments in synchronizer.cpp for how we might encode extra
1963   // state in _succ so we can avoid fetching EntryList|cxq.
1964   //
1965   // I'd like to add more cases in fast_lock() and fast_unlock() --
1966   // such as recursive enter and exit -- but we have to be wary of
1967   // I$ bloat, T$ effects and BP$ effects.
1968   //
1969   // If there's no contention try a 1-0 exit.  That is, exit without
1970   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1971   // we detect and recover from the race that the 1-0 exit admits.
1972   //
1973   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1974   // before it STs null into _owner, releasing the lock.  Updates
1975   // to data protected by the critical section must be visible before
1976   // we drop the lock (and thus before any other thread could acquire
1977   // the lock and observe the fields protected by the lock).
1978   // IA32's memory model keeps stores in program order (TSO), so STs are ordered with respect to
1979   // each other and there's no need for an explicit barrier (fence).
1980   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1981 #ifndef _LP64
1982   get_thread (boxReg);
1983 
1984   // Note that we could employ various encoding schemes to reduce
1985   // the number of loads below (currently 4) to just 2 or 3.
1986   // Refer to the comments in synchronizer.cpp.
1987   // In practice the chain of fetches doesn't seem to impact performance, however.
1988   xorptr(boxReg, boxReg);
1989   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1990   jccb  (Assembler::notZero, DONE_LABEL);
1991   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1992   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1993   jccb  (Assembler::notZero, CheckSucc);
1994   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1995   jmpb  (DONE_LABEL);
1996 
1997   bind (Stacked);
1998   // It's not inflated and it's not recursively stack-locked and it's not biased.
1999   // It must be stack-locked.
2000   // Try to reset the header to displaced header.
2001   // The "box" value on the stack is stable, so we can reload
2002   // and be assured we observe the same value as above.
2003   movptr(tmpReg, Address(boxReg, 0));
2004   lock();
2005   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2006   // Intentional fall-through into DONE_LABEL
2007 
2008   // DONE_LABEL is a hot target - we'd really like to place it at the
2009   // start of cache line by padding with NOPs.
2010   // See the AMD and Intel software optimization manuals for the
2011   // most efficient "long" NOP encodings.
2012   // Unfortunately none of our alignment mechanisms suffice.
2013   bind (CheckSucc);
2014 #else // _LP64
2015   // It's inflated
2016   xorptr(boxReg, boxReg);
2017   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2018   jccb  (Assembler::notZero, DONE_LABEL);
2019   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2020   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2021   jccb  (Assembler::notZero, CheckSucc);
2022   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2023   jmpb  (DONE_LABEL);
2024 
2025   // Try to avoid passing control into the slow_path ...
2026   Label LSuccess, LGoSlowPath ;
2027   bind  (CheckSucc);
2028 
2029   // The following optional optimization can be elided if necessary
2030   // Effectively: if (succ == null) goto SlowPath
2031   // The code reduces the window for a race, however,
2032   // and thus benefits performance.
2033   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2034   jccb  (Assembler::zero, LGoSlowPath);
2035 
2036   xorptr(boxReg, boxReg);
2037   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2038 
2039   // Memory barrier/fence
2040   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2041   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2042   // This is faster on Nehalem and AMD Shanghai/Barcelona.
2043   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2044   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2045   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2046   lock(); addl(Address(rsp, 0), 0);
2047 
2048   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2049   jccb  (Assembler::notZero, LSuccess);
2050 
2051   // Rare inopportune interleaving - race.
2052   // The successor vanished in the small window above.
2053   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2054   // We need to ensure progress and succession.
2055   // Try to reacquire the lock.
2056   // If that fails then the new owner is responsible for succession and this
2057   // thread needs to take no further action and can exit via the fast path (success).
2058   // If the re-acquire succeeds then pass control into the slow path.
2059   // As implemented, this latter mode is horrible because we generate more
2060   // coherence traffic on the lock *and* artificially extend the critical section
2061   // length by virtue of passing control into the slow path.
2062 
2063   // box is really RAX -- the following CMPXCHG depends on that binding
2064   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2065   lock();
2066   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2067   // There's no successor so we tried to regrab the lock.
2068   // If that didn't work, then another thread grabbed the
2069   // lock so we're done (and exit was a success).
2070   jccb  (Assembler::notEqual, LSuccess);
2071   // Intentional fall-through into slow-path
2072 
2073   bind  (LGoSlowPath);
2074   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2075   jmpb  (DONE_LABEL);
2076 
2077   bind  (LSuccess);
2078   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2079   jmpb  (DONE_LABEL);
2080 
2081   bind  (Stacked);
2082   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2083   lock();
2084   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2085 
2086 #endif
2087   bind(DONE_LABEL);
2088 }
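
// Illustrative sketch only (not used by the VM): the 1-0 exit described in the
// comments inside fast_unlock above, over a hypothetical monitor layout. The
// owner field is cleared with release semantics and a full fence then orders
// that store before the load of _succ (the Dekker-style "ST owner; MEMBAR; LD
// succ" pivot); if no successor is visible the caller must take the slow path
// or try to re-acquire the lock to guarantee succession.
struct MonitorSketch {
  volatile intptr_t owner;
  volatile intptr_t succ;
};

static inline bool one_zero_exit_sketch(MonitorSketch* m) {
  __atomic_store_n(&m->owner, (intptr_t)0, __ATOMIC_RELEASE);  // drop the lock
  __atomic_thread_fence(__ATOMIC_SEQ_CST);                     // order ST owner before LD succ
  return m->succ != 0;               // a successor exists -> exit is complete
}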
2089 #endif // COMPILER2
2090 
2091 void MacroAssembler::c2bool(Register x) {
2092   // implements x == 0 ? 0 : 1
2093   // note: must only look at least-significant byte of x
2094   //       since C-style booleans are stored in one byte
2095   //       only! (was bug)
2096   andl(x, 0xFF);
2097   setb(Assembler::notZero, x);
2098 }
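
// Illustrative sketch only: the C-level meaning of c2bool above. Only the
// least-significant byte participates because C-style booleans are stored in
// a single byte.
static inline int c2bool_sketch(int x) {
  return ((x & 0xFF) != 0) ? 1 : 0;
}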
2099 
2100 // This wouldn't be needed if the AddressLiteral version had a different name.
2101 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2102   Assembler::call(L, rtype);
2103 }
2104 
2105 void MacroAssembler::call(Register entry) {
2106   Assembler::call(entry);
2107 }
2108 
2109 void MacroAssembler::call(AddressLiteral entry) {
2110   if (reachable(entry)) {
2111     Assembler::call_literal(entry.target(), entry.rspec());
2112   } else {
2113     lea(rscratch1, entry);
2114     Assembler::call(rscratch1);
2115   }
2116 }
2117 
2118 void MacroAssembler::ic_call(address entry, jint method_index) {
2119   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2120   movptr(rax, (intptr_t)Universe::non_oop_word());
2121   call(AddressLiteral(entry, rh));
2122 }
2123 
2124 // Implementation of call_VM versions
2125 
2126 void MacroAssembler::call_VM(Register oop_result,
2127                              address entry_point,
2128                              bool check_exceptions) {
2129   Label C, E;
2130   call(C, relocInfo::none);
2131   jmp(E);
2132 
2133   bind(C);
2134   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2135   ret(0);
2136 
2137   bind(E);
2138 }
2139 
2140 void MacroAssembler::call_VM(Register oop_result,
2141                              address entry_point,
2142                              Register arg_1,
2143                              bool check_exceptions) {
2144   Label C, E;
2145   call(C, relocInfo::none);
2146   jmp(E);
2147 
2148   bind(C);
2149   pass_arg1(this, arg_1);
2150   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2151   ret(0);
2152 
2153   bind(E);
2154 }
2155 
2156 void MacroAssembler::call_VM(Register oop_result,
2157                              address entry_point,
2158                              Register arg_1,
2159                              Register arg_2,
2160                              bool check_exceptions) {
2161   Label C, E;
2162   call(C, relocInfo::none);
2163   jmp(E);
2164 
2165   bind(C);
2166 
2167   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2168 
2169   pass_arg2(this, arg_2);
2170   pass_arg1(this, arg_1);
2171   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2172   ret(0);
2173 
2174   bind(E);
2175 }
2176 
2177 void MacroAssembler::call_VM(Register oop_result,
2178                              address entry_point,
2179                              Register arg_1,
2180                              Register arg_2,
2181                              Register arg_3,
2182                              bool check_exceptions) {
2183   Label C, E;
2184   call(C, relocInfo::none);
2185   jmp(E);
2186 
2187   bind(C);
2188 
2189   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2190   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2191   pass_arg3(this, arg_3);
2192 
2193   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2194   pass_arg2(this, arg_2);
2195 
2196   pass_arg1(this, arg_1);
2197   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2198   ret(0);
2199 
2200   bind(E);
2201 }
2202 
2203 void MacroAssembler::call_VM(Register oop_result,
2204                              Register last_java_sp,
2205                              address entry_point,
2206                              int number_of_arguments,
2207                              bool check_exceptions) {
2208   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2209   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2210 }
2211 
2212 void MacroAssembler::call_VM(Register oop_result,
2213                              Register last_java_sp,
2214                              address entry_point,
2215                              Register arg_1,
2216                              bool check_exceptions) {
2217   pass_arg1(this, arg_1);
2218   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2219 }
2220 
2221 void MacroAssembler::call_VM(Register oop_result,
2222                              Register last_java_sp,
2223                              address entry_point,
2224                              Register arg_1,
2225                              Register arg_2,
2226                              bool check_exceptions) {
2227 
2228   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2229   pass_arg2(this, arg_2);
2230   pass_arg1(this, arg_1);
2231   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2232 }
2233 
2234 void MacroAssembler::call_VM(Register oop_result,
2235                              Register last_java_sp,
2236                              address entry_point,
2237                              Register arg_1,
2238                              Register arg_2,
2239                              Register arg_3,
2240                              bool check_exceptions) {
2241   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2242   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2243   pass_arg3(this, arg_3);
2244   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2245   pass_arg2(this, arg_2);
2246   pass_arg1(this, arg_1);
2247   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2248 }
2249 
2250 void MacroAssembler::super_call_VM(Register oop_result,
2251                                    Register last_java_sp,
2252                                    address entry_point,
2253                                    int number_of_arguments,
2254                                    bool check_exceptions) {
2255   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2256   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2257 }
2258 
2259 void MacroAssembler::super_call_VM(Register oop_result,
2260                                    Register last_java_sp,
2261                                    address entry_point,
2262                                    Register arg_1,
2263                                    bool check_exceptions) {
2264   pass_arg1(this, arg_1);
2265   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2266 }
2267 
2268 void MacroAssembler::super_call_VM(Register oop_result,
2269                                    Register last_java_sp,
2270                                    address entry_point,
2271                                    Register arg_1,
2272                                    Register arg_2,
2273                                    bool check_exceptions) {
2274 
2275   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2276   pass_arg2(this, arg_2);
2277   pass_arg1(this, arg_1);
2278   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2279 }
2280 
2281 void MacroAssembler::super_call_VM(Register oop_result,
2282                                    Register last_java_sp,
2283                                    address entry_point,
2284                                    Register arg_1,
2285                                    Register arg_2,
2286                                    Register arg_3,
2287                                    bool check_exceptions) {
2288   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2289   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2290   pass_arg3(this, arg_3);
2291   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2292   pass_arg2(this, arg_2);
2293   pass_arg1(this, arg_1);
2294   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2295 }
2296 
2297 void MacroAssembler::call_VM_base(Register oop_result,
2298                                   Register java_thread,
2299                                   Register last_java_sp,
2300                                   address  entry_point,
2301                                   int      number_of_arguments,
2302                                   bool     check_exceptions) {
2303   // determine java_thread register
2304   if (!java_thread->is_valid()) {
2305 #ifdef _LP64
2306     java_thread = r15_thread;
2307 #else
2308     java_thread = rdi;
2309     get_thread(java_thread);
2310 #endif // LP64
2311   }
2312   // determine last_java_sp register
2313   if (!last_java_sp->is_valid()) {
2314     last_java_sp = rsp;
2315   }
2316   // debugging support
2317   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2318   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2319 #ifdef ASSERT
2320   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2321   // r12 is the heapbase.
2322   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2323 #endif // ASSERT
2324 
2325   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2326   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2327 
2328   // push java thread (becomes first argument of C function)
2329 
2330   NOT_LP64(push(java_thread); number_of_arguments++);
2331   LP64_ONLY(mov(c_rarg0, r15_thread));
2332 
2333   // set last Java frame before call
2334   assert(last_java_sp != rbp, "can't use ebp/rbp");
2335 
2336   // Only interpreter should have to set fp
2337   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2338 
2339   // do the call, remove parameters
2340   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2341 
2342   // restore the thread (cannot use the pushed argument since arguments
2343   // may be overwritten by C code generated by an optimizing compiler);
2344   // however, we can use the register value directly if it is callee saved.
2345   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2346     // rdi & rsi (also r15) are callee saved -> nothing to do
2347 #ifdef ASSERT
2348     guarantee(java_thread != rax, "change this code");
2349     push(rax);
2350     { Label L;
2351       get_thread(rax);
2352       cmpptr(java_thread, rax);
2353       jcc(Assembler::equal, L);
2354       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2355       bind(L);
2356     }
2357     pop(rax);
2358 #endif
2359   } else {
2360     get_thread(java_thread);
2361   }
2362   // reset last Java frame
2363   // Only interpreter should have to clear fp
2364   reset_last_Java_frame(java_thread, true);
2365 
2366    // C++ interp handles this in the interpreter
2367   check_and_handle_popframe(java_thread);
2368   check_and_handle_earlyret(java_thread);
2369 
2370   if (check_exceptions) {
2371     // check for pending exceptions (java_thread is set upon return)
2372     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2373 #ifndef _LP64
2374     jump_cc(Assembler::notEqual,
2375             RuntimeAddress(StubRoutines::forward_exception_entry()));
2376 #else
2377     // This used to conditionally jump to forward_exception, however it is
2378     // possible that, if we relocate, the branch will not reach. So we must jump
2379     // around it so we can always reach the target.
2380 
2381     Label ok;
2382     jcc(Assembler::equal, ok);
2383     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2384     bind(ok);
2385 #endif // LP64
2386   }
2387 
2388   // get oop result if there is one and reset the value in the thread
2389   if (oop_result->is_valid()) {
2390     get_vm_result(oop_result, java_thread);
2391   }
2392 }
2393 
2394 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2395 
2396   // Calculating the value for last_Java_sp is
2397   // somewhat subtle. call_VM does an intermediate call
2398   // which places a return address on the stack just under the
2399   // stack pointer as the user finished with it. This allows
2400   // us to retrieve last_Java_pc from last_Java_sp[-1].
2401   // On 32bit we then have to push additional args on the stack to accomplish
2402   // the actual requested call. On 64bit call_VM only can use register args
2403   // so the only extra space is the return address that call_VM created.
2404   // This hopefully explains the calculations here.
2405 
2406 #ifdef _LP64
2407   // We've pushed one address, correct last_Java_sp
2408   lea(rax, Address(rsp, wordSize));
2409 #else
2410   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2411 #endif // LP64
2412 
2413   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2414 
2415 }
2416 
2417 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
2418 void MacroAssembler::call_VM_leaf0(address entry_point) {
2419   MacroAssembler::call_VM_leaf_base(entry_point, 0);
2420 }
2421 
2422 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2423   call_VM_leaf_base(entry_point, number_of_arguments);
2424 }
2425 
2426 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2427   pass_arg0(this, arg_0);
2428   call_VM_leaf(entry_point, 1);
2429 }
2430 
2431 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2432 
2433   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2434   pass_arg1(this, arg_1);
2435   pass_arg0(this, arg_0);
2436   call_VM_leaf(entry_point, 2);
2437 }
2438 
2439 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2440   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2441   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2442   pass_arg2(this, arg_2);
2443   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2444   pass_arg1(this, arg_1);
2445   pass_arg0(this, arg_0);
2446   call_VM_leaf(entry_point, 3);
2447 }
2448 
2449 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2450   pass_arg0(this, arg_0);
2451   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2452 }
2453 
2454 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2455 
2456   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2457   pass_arg1(this, arg_1);
2458   pass_arg0(this, arg_0);
2459   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2460 }
2461 
2462 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2463   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2464   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2465   pass_arg2(this, arg_2);
2466   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2467   pass_arg1(this, arg_1);
2468   pass_arg0(this, arg_0);
2469   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2470 }
2471 
2472 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2473   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2474   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2475   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2476   pass_arg3(this, arg_3);
2477   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2478   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2479   pass_arg2(this, arg_2);
2480   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2481   pass_arg1(this, arg_1);
2482   pass_arg0(this, arg_0);
2483   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2484 }
2485 
2486 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2487   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2488   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2489   verify_oop(oop_result, "broken oop in call_VM_base");
2490 }
2491 
2492 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2493   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2494   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2495 }
2496 
2497 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2498 }
2499 
2500 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2501 }
2502 
2503 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2504   if (reachable(src1)) {
2505     cmpl(as_Address(src1), imm);
2506   } else {
2507     lea(rscratch1, src1);
2508     cmpl(Address(rscratch1, 0), imm);
2509   }
2510 }
2511 
2512 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2513   assert(!src2.is_lval(), "use cmpptr");
2514   if (reachable(src2)) {
2515     cmpl(src1, as_Address(src2));
2516   } else {
2517     lea(rscratch1, src2);
2518     cmpl(src1, Address(rscratch1, 0));
2519   }
2520 }
2521 
2522 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2523   Assembler::cmpl(src1, imm);
2524 }
2525 
2526 void MacroAssembler::cmp32(Register src1, Address src2) {
2527   Assembler::cmpl(src1, src2);
2528 }
2529 
2530 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2531   ucomisd(opr1, opr2);
2532 
2533   Label L;
2534   if (unordered_is_less) {
2535     movl(dst, -1);
2536     jcc(Assembler::parity, L);
2537     jcc(Assembler::below , L);
2538     movl(dst, 0);
2539     jcc(Assembler::equal , L);
2540     increment(dst);
2541   } else { // unordered is greater
2542     movl(dst, 1);
2543     jcc(Assembler::parity, L);
2544     jcc(Assembler::above , L);
2545     movl(dst, 0);
2546     jcc(Assembler::equal , L);
2547     decrementl(dst);
2548   }
2549   bind(L);
2550 }
2551 
2552 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2553   ucomiss(opr1, opr2);
2554 
2555   Label L;
2556   if (unordered_is_less) {
2557     movl(dst, -1);
2558     jcc(Assembler::parity, L);
2559     jcc(Assembler::below , L);
2560     movl(dst, 0);
2561     jcc(Assembler::equal , L);
2562     increment(dst);
2563   } else { // unordered is greater
2564     movl(dst, 1);
2565     jcc(Assembler::parity, L);
2566     jcc(Assembler::above , L);
2567     movl(dst, 0);
2568     jcc(Assembler::equal , L);
2569     decrementl(dst);
2570   }
2571   bind(L);
2572 }
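
// Illustrative sketch only (not used by the VM): the value produced by
// cmpsd2int/cmpss2int above, expressed directly in C. NaN operands are
// "unordered" and map to -1 or +1 depending on unordered_is_less, matching
// the Java fcmpl/fcmpg (dcmpl/dcmpg) bytecode semantics.
static inline int fp_compare_sketch(double a, double b, bool unordered_is_less) {
  if (a != a || b != b) {            // at least one NaN -> unordered
    return unordered_is_less ? -1 : 1;
  }
  if (a < b)  return -1;
  if (a > b)  return  1;
  return 0;
}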
2573 
2574 
2575 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2576   if (reachable(src1)) {
2577     cmpb(as_Address(src1), imm);
2578   } else {
2579     lea(rscratch1, src1);
2580     cmpb(Address(rscratch1, 0), imm);
2581   }
2582 }
2583 
2584 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2585 #ifdef _LP64
2586   if (src2.is_lval()) {
2587     movptr(rscratch1, src2);
2588     Assembler::cmpq(src1, rscratch1);
2589   } else if (reachable(src2)) {
2590     cmpq(src1, as_Address(src2));
2591   } else {
2592     lea(rscratch1, src2);
2593     Assembler::cmpq(src1, Address(rscratch1, 0));
2594   }
2595 #else
2596   if (src2.is_lval()) {
2597     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2598   } else {
2599     cmpl(src1, as_Address(src2));
2600   }
2601 #endif // _LP64
2602 }
2603 
2604 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2605   assert(src2.is_lval(), "not a mem-mem compare");
2606 #ifdef _LP64
2607   // moves src2's literal address
2608   movptr(rscratch1, src2);
2609   Assembler::cmpq(src1, rscratch1);
2610 #else
2611   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2612 #endif // _LP64
2613 }
2614 
2615 void MacroAssembler::cmpoop(Register src1, Register src2) {
2616   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2617   bs->obj_equals(this, src1, src2);
2618 }
2619 
2620 void MacroAssembler::cmpoop(Register src1, Address src2) {
2621   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2622   bs->obj_equals(this, src1, src2);
2623 }
2624 
2625 #ifdef _LP64
2626 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2627   movoop(rscratch1, src2);
2628   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2629   bs->obj_equals(this, src1, rscratch1);
2630 }
2631 #endif
2632 
2633 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2634   if (reachable(adr)) {
2635     lock();
2636     cmpxchgptr(reg, as_Address(adr));
2637   } else {
2638     lea(rscratch1, adr);
2639     lock();
2640     cmpxchgptr(reg, Address(rscratch1, 0));
2641   }
2642 }
2643 
2644 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2645   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2646 }
2647 
2648 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2649   if (reachable(src)) {
2650     Assembler::comisd(dst, as_Address(src));
2651   } else {
2652     lea(rscratch1, src);
2653     Assembler::comisd(dst, Address(rscratch1, 0));
2654   }
2655 }
2656 
2657 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2658   if (reachable(src)) {
2659     Assembler::comiss(dst, as_Address(src));
2660   } else {
2661     lea(rscratch1, src);
2662     Assembler::comiss(dst, Address(rscratch1, 0));
2663   }
2664 }
2665 
2666 
2667 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2668   Condition negated_cond = negate_condition(cond);
2669   Label L;
2670   jcc(negated_cond, L);
2671   pushf(); // Preserve flags
2672   atomic_incl(counter_addr);
2673   popf();
2674   bind(L);
2675 }
2676 
2677 int MacroAssembler::corrected_idivl(Register reg) {
2678   // Full implementation of Java idiv and irem; checks for
2679   // special case as described in JVM spec., p.243 & p.271.
2680   // The function returns the (pc) offset of the idivl
2681   // instruction - may be needed for implicit exceptions.
2682   //
2683   //         normal case                           special case
2684   //
2685   // input : rax,: dividend                         min_int
2686   //         reg: divisor   (may not be rax,/rdx)   -1
2687   //
2688   // output: rax,: quotient  (= rax, idiv reg)       min_int
2689   //         rdx: remainder (= rax, irem reg)       0
2690   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
2691   const int min_int = 0x80000000;
2692   Label normal_case, special_case;
2693 
2694   // check for special case
2695   cmpl(rax, min_int);
2696   jcc(Assembler::notEqual, normal_case);
2697   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2698   cmpl(reg, -1);
2699   jcc(Assembler::equal, special_case);
2700 
2701   // handle normal case
2702   bind(normal_case);
2703   cdql();
2704   int idivl_offset = offset();
2705   idivl(reg);
2706 
2707   // normal and special case exit
2708   bind(special_case);
2709 
2710   return idivl_offset;
2711 }
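
// Illustrative sketch only (not used by the VM): the result contract of
// corrected_idivl above. Java integer division must not trap on the
// min_int / -1 overflow case; it yields quotient min_int and remainder 0.
static inline void java_idiv_irem_sketch(jint dividend, jint divisor, jint* q, jint* r) {
  if (dividend == min_jint && divisor == -1) {
    *q = min_jint;                   // overflow case defined by the JVM spec
    *r = 0;
  } else {
    *q = dividend / divisor;         // division by zero is handled elsewhere (ArithmeticException)
    *r = dividend % divisor;
  }
}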
2712 
2713 
2714 
2715 void MacroAssembler::decrementl(Register reg, int value) {
2716   if (value == min_jint) {subl(reg, value) ; return; }
2717   if (value <  0) { incrementl(reg, -value); return; }
2718   if (value == 0) {                        ; return; }
2719   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2720   /* else */      { subl(reg, value)       ; return; }
2721 }
2722 
2723 void MacroAssembler::decrementl(Address dst, int value) {
2724   if (value == min_jint) {subl(dst, value) ; return; }
2725   if (value <  0) { incrementl(dst, -value); return; }
2726   if (value == 0) {                        ; return; }
2727   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2728   /* else */      { subl(dst, value)       ; return; }
2729 }
2730 
2731 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2732   assert (shift_value > 0, "illegal shift value");
2733   Label _is_positive;
2734   testl (reg, reg);
2735   jcc (Assembler::positive, _is_positive);
2736   int offset = (1 << shift_value) - 1 ;
2737 
2738   if (offset == 1) {
2739     incrementl(reg);
2740   } else {
2741     addl(reg, offset);
2742   }
2743 
2744   bind (_is_positive);
2745   sarl(reg, shift_value);
2746 }
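
// Illustrative sketch only (not used by the VM): division_with_shift above
// implements signed division by a power of two with round-toward-zero
// semantics; the (2^shift - 1) bias is added for negative inputs before the
// arithmetic shift.
static inline int div_by_pow2_sketch(int x, int shift) {
  if (x < 0) {
    x += (1 << shift) - 1;           // bias so the shift truncates toward zero
  }
  return x >> shift;                 // arithmetic shift (sarl)
}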
2747 
2748 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2749   if (reachable(src)) {
2750     Assembler::divsd(dst, as_Address(src));
2751   } else {
2752     lea(rscratch1, src);
2753     Assembler::divsd(dst, Address(rscratch1, 0));
2754   }
2755 }
2756 
2757 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2758   if (reachable(src)) {
2759     Assembler::divss(dst, as_Address(src));
2760   } else {
2761     lea(rscratch1, src);
2762     Assembler::divss(dst, Address(rscratch1, 0));
2763   }
2764 }
2765 
2766 // !defined(COMPILER2) is because of stupid core builds
2767 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2768 void MacroAssembler::empty_FPU_stack() {
2769   if (VM_Version::supports_mmx()) {
2770     emms();
2771   } else {
2772     for (int i = 8; i-- > 0; ) ffree(i);
2773   }
2774 }
2775 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2776 
2777 
2778 void MacroAssembler::enter() {
2779   push(rbp);
2780   mov(rbp, rsp);
2781 }
2782 
2783 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2784 void MacroAssembler::fat_nop() {
2785   if (UseAddressNop) {
2786     addr_nop_5();
2787   } else {
2788     emit_int8(0x26); // es:
2789     emit_int8(0x2e); // cs:
2790     emit_int8(0x64); // fs:
2791     emit_int8(0x65); // gs:
2792     emit_int8((unsigned char)0x90);
2793   }
2794 }
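
// Illustrative sketch only: the byte sequence emitted by the fallback branch of
// fat_nop above. Four ignored segment-override prefixes in front of a one-byte
// NOP decode as a single 5-byte instruction, so the site can be patched atomically.
static const unsigned char fat_nop_bytes_sketch[] = { 0x26, 0x2E, 0x64, 0x65, 0x90 };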
2795 
2796 void MacroAssembler::fcmp(Register tmp) {
2797   fcmp(tmp, 1, true, true);
2798 }
2799 
2800 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2801   assert(!pop_right || pop_left, "usage error");
2802   if (VM_Version::supports_cmov()) {
2803     assert(tmp == noreg, "unneeded temp");
2804     if (pop_left) {
2805       fucomip(index);
2806     } else {
2807       fucomi(index);
2808     }
2809     if (pop_right) {
2810       fpop();
2811     }
2812   } else {
2813     assert(tmp != noreg, "need temp");
2814     if (pop_left) {
2815       if (pop_right) {
2816         fcompp();
2817       } else {
2818         fcomp(index);
2819       }
2820     } else {
2821       fcom(index);
2822     }
2823     // convert FPU condition into eflags condition via rax
2824     save_rax(tmp);
2825     fwait(); fnstsw_ax();
2826     sahf();
2827     restore_rax(tmp);
2828   }
2829   // condition codes set as follows:
2830   //
2831   // CF (corresponds to C0) if x < y
2832   // PF (corresponds to C2) if unordered
2833   // ZF (corresponds to C3) if x = y
2834 }
2835 
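     // Compare ST(0) against ST(index) and materialize a three-way result in dst:
     // -1 if less, 0 if equal, +1 if greater; unordered_is_less chooses whether an
     // unordered (NaN) comparison yields -1 or +1.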
2836 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2837   fcmp2int(dst, unordered_is_less, 1, true, true);
2838 }
2839 
2840 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2841   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2842   Label L;
2843   if (unordered_is_less) {
2844     movl(dst, -1);
2845     jcc(Assembler::parity, L);
2846     jcc(Assembler::below , L);
2847     movl(dst, 0);
2848     jcc(Assembler::equal , L);
2849     increment(dst);
2850   } else { // unordered is greater
2851     movl(dst, 1);
2852     jcc(Assembler::parity, L);
2853     jcc(Assembler::above , L);
2854     movl(dst, 0);
2855     jcc(Assembler::equal , L);
2856     decrementl(dst);
2857   }
2858   bind(L);
2859 }
2860 
2861 void MacroAssembler::fld_d(AddressLiteral src) {
2862   fld_d(as_Address(src));
2863 }
2864 
2865 void MacroAssembler::fld_s(AddressLiteral src) {
2866   fld_s(as_Address(src));
2867 }
2868 
2869 void MacroAssembler::fld_x(AddressLiteral src) {
2870   Assembler::fld_x(as_Address(src));
2871 }
2872 
2873 void MacroAssembler::fldcw(AddressLiteral src) {
2874   Assembler::fldcw(as_Address(src));
2875 }
2876 
2877 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2878   if (reachable(src)) {
2879     Assembler::mulpd(dst, as_Address(src));
2880   } else {
2881     lea(rscratch1, src);
2882     Assembler::mulpd(dst, Address(rscratch1, 0));
2883   }
2884 }
2885 
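     // Temporarily raise the x87 precision control to 64-bit (extended) precision.
     // The original control word is left at Address(rsp, 0) so restore_precision()
     // can reload it and release the slot.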
2886 void MacroAssembler::increase_precision() {
2887   subptr(rsp, BytesPerWord);
2888   fnstcw(Address(rsp, 0));
2889   movl(rax, Address(rsp, 0));
2890   orl(rax, 0x300);
2891   push(rax);
2892   fldcw(Address(rsp, 0));
2893   pop(rax);
2894 }
2895 
2896 void MacroAssembler::restore_precision() {
2897   fldcw(Address(rsp, 0));
2898   addptr(rsp, BytesPerWord);
2899 }
2900 
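     // Pop the x87 stack: mark ST(0) as free and advance the FPU stack top.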
2901 void MacroAssembler::fpop() {
2902   ffree();
2903   fincstp();
2904 }
2905 
2906 void MacroAssembler::load_float(Address src) {
2907   if (UseSSE >= 1) {
2908     movflt(xmm0, src);
2909   } else {
2910     LP64_ONLY(ShouldNotReachHere());
2911     NOT_LP64(fld_s(src));
2912   }
2913 }
2914 
2915 void MacroAssembler::store_float(Address dst) {
2916   if (UseSSE >= 1) {
2917     movflt(dst, xmm0);
2918   } else {
2919     LP64_ONLY(ShouldNotReachHere());
2920     NOT_LP64(fstp_s(dst));
2921   }
2922 }
2923 
2924 void MacroAssembler::load_double(Address src) {
2925   if (UseSSE >= 2) {
2926     movdbl(xmm0, src);
2927   } else {
2928     LP64_ONLY(ShouldNotReachHere());
2929     NOT_LP64(fld_d(src));
2930   }
2931 }
2932 
2933 void MacroAssembler::store_double(Address dst) {
2934   if (UseSSE >= 2) {
2935     movdbl(dst, xmm0);
2936   } else {
2937     LP64_ONLY(ShouldNotReachHere());
2938     NOT_LP64(fstp_d(dst));
2939   }
2940 }
2941 
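     // ST0 = fmod(ST0, ST1). fprem may return only a partial remainder, so loop
     // until the FPU clears the C2 (incomplete) flag, then drop the divisor.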
2942 void MacroAssembler::fremr(Register tmp) {
2943   save_rax(tmp);
2944   { Label L;
2945     bind(L);
2946     fprem();
2947     fwait(); fnstsw_ax();
2948 #ifdef _LP64
2949     testl(rax, 0x400);
2950     jcc(Assembler::notEqual, L);
2951 #else
2952     sahf();
2953     jcc(Assembler::parity, L);
2954 #endif // _LP64
2955   }
2956   restore_rax(tmp);
2957   // Result is in ST0.
2958   // Note: fxch & fpop to get rid of ST1
2959   // (otherwise FPU stack could overflow eventually)
2960   fxch(1);
2961   fpop();
2962 }
2963 
2964 // dst = c = a * b + c
2965 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2966   Assembler::vfmadd231sd(c, a, b);
2967   if (dst != c) {
2968     movdbl(dst, c);
2969   }
2970 }
2971 
2972 // dst = c = a * b + c
2973 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2974   Assembler::vfmadd231ss(c, a, b);
2975   if (dst != c) {
2976     movflt(dst, c);
2977   }
2978 }
2979 
2980 // dst = c = a * b + c
2981 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2982   Assembler::vfmadd231pd(c, a, b, vector_len);
2983   if (dst != c) {
2984     vmovdqu(dst, c);
2985   }
2986 }
2987 
2988 // dst = c = a * b + c
2989 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2990   Assembler::vfmadd231ps(c, a, b, vector_len);
2991   if (dst != c) {
2992     vmovdqu(dst, c);
2993   }
2994 }
2995 
2996 // dst = c = a * b + c
2997 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2998   Assembler::vfmadd231pd(c, a, b, vector_len);
2999   if (dst != c) {
3000     vmovdqu(dst, c);
3001   }
3002 }
3003 
3004 // dst = c = a * b + c
3005 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
3006   Assembler::vfmadd231ps(c, a, b, vector_len);
3007   if (dst != c) {
3008     vmovdqu(dst, c);
3009   }
3010 }
3011 
3012 void MacroAssembler::incrementl(AddressLiteral dst) {
3013   if (reachable(dst)) {
3014     incrementl(as_Address(dst));
3015   } else {
3016     lea(rscratch1, dst);
3017     incrementl(Address(rscratch1, 0));
3018   }
3019 }
3020 
3021 void MacroAssembler::incrementl(ArrayAddress dst) {
3022   incrementl(as_Address(dst));
3023 }
3024 
3025 void MacroAssembler::incrementl(Register reg, int value) {
3026   if (value == min_jint) {addl(reg, value) ; return; }
3027   if (value <  0) { decrementl(reg, -value); return; }
3028   if (value == 0) {                        ; return; }
3029   if (value == 1 && UseIncDec) { incl(reg) ; return; }
3030   /* else */      { addl(reg, value)       ; return; }
3031 }
3032 
3033 void MacroAssembler::incrementl(Address dst, int value) {
3034   if (value == min_jint) {addl(dst, value) ; return; }
3035   if (value <  0) { decrementl(dst, -value); return; }
3036   if (value == 0) {                        ; return; }
3037   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3038   /* else */      { addl(dst, value)       ; return; }
3039 }
3040 
3041 void MacroAssembler::jump(AddressLiteral dst) {
3042   if (reachable(dst)) {
3043     jmp_literal(dst.target(), dst.rspec());
3044   } else {
3045     lea(rscratch1, dst);
3046     jmp(rscratch1);
3047   }
3048 }
3049 
3050 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3051   if (reachable(dst)) {
3052     InstructionMark im(this);
3053     relocate(dst.reloc());
3054     const int short_size = 2;
3055     const int long_size = 6;
3056     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3057     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3058       // 0111 tttn #8-bit disp
3059       emit_int8(0x70 | cc);
3060       emit_int8((offs - short_size) & 0xFF);
3061     } else {
3062       // 0000 1111 1000 tttn #32-bit disp
3063       emit_int8(0x0F);
3064       emit_int8((unsigned char)(0x80 | cc));
3065       emit_int32(offs - long_size);
3066     }
3067   } else {
3068 #ifdef ASSERT
3069     warning("reversing conditional branch");
3070 #endif /* ASSERT */
3071     Label skip;
3072     jccb(reverse[cc], skip);
3073     lea(rscratch1, dst);
3074     Assembler::jmp(rscratch1);
3075     bind(skip);
3076   }
3077 }
3078 
3079 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3080   if (reachable(src)) {
3081     Assembler::ldmxcsr(as_Address(src));
3082   } else {
3083     lea(rscratch1, src);
3084     Assembler::ldmxcsr(Address(rscratch1, 0));
3085   }
3086 }
3087 
3088 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3089   int off;
3090   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3091     off = offset();
3092     movsbl(dst, src); // movsxb
3093   } else {
3094     off = load_unsigned_byte(dst, src);
3095     shll(dst, 24);
3096     sarl(dst, 24);
3097   }
3098   return off;
3099 }
3100 
3101 // Note: load_signed_short used to be called load_signed_word.
3102 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3103 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3104 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3105 int MacroAssembler::load_signed_short(Register dst, Address src) {
3106   int off;
3107   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3108     // This is dubious: it seems safe to do a signed 16 => 64 bit sign
3109     // extension, but this is what 64-bit has always done, which implies
3110     // that callers only rely on the low 32 bits.
3111     off = offset();
3112     movswl(dst, src); // movsxw
3113   } else {
3114     off = load_unsigned_short(dst, src);
3115     shll(dst, 16);
3116     sarl(dst, 16);
3117   }
3118   return off;
3119 }
3120 
3121 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3122   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3123   // and "3.9 Partial Register Penalties", p. 22.
3124   int off;
3125   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3126     off = offset();
3127     movzbl(dst, src); // movzxb
3128   } else {
3129     xorl(dst, dst);
3130     off = offset();
3131     movb(dst, src);
3132   }
3133   return off;
3134 }
3135 
3136 // Note: load_unsigned_short used to be called load_unsigned_word.
3137 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3138   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3139   // and "3.9 Partial Register Penalties", p. 22.
3140   int off;
3141   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3142     off = offset();
3143     movzwl(dst, src); // movzxw
3144   } else {
3145     xorl(dst, dst);
3146     off = offset();
3147     movw(dst, src);
3148   }
3149   return off;
3150 }
3151 
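     // Load a value of size_in_bytes from src into dst, sign- or zero-extending
     // sub-word sizes; on 32-bit platforms a 64-bit load also needs dst2 for the
     // high word.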
3152 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3153   switch (size_in_bytes) {
3154 #ifndef _LP64
3155   case  8:
3156     assert(dst2 != noreg, "second dest register required");
3157     movl(dst,  src);
3158     movl(dst2, src.plus_disp(BytesPerInt));
3159     break;
3160 #else
3161   case  8:  movq(dst, src); break;
3162 #endif
3163   case  4:  movl(dst, src); break;
3164   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3165   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3166   default:  ShouldNotReachHere();
3167   }
3168 }
3169 
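     // Store the low size_in_bytes of src to dst; on 32-bit platforms a 64-bit
     // store also needs src2 for the high word.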
3170 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3171   switch (size_in_bytes) {
3172 #ifndef _LP64
3173   case  8:
3174     assert(src2 != noreg, "second source register required");
3175     movl(dst,                        src);
3176     movl(dst.plus_disp(BytesPerInt), src2);
3177     break;
3178 #else
3179   case  8:  movq(dst, src); break;
3180 #endif
3181   case  4:  movl(dst, src); break;
3182   case  2:  movw(dst, src); break;
3183   case  1:  movb(dst, src); break;
3184   default:  ShouldNotReachHere();
3185   }
3186 }
3187 
3188 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3189   if (reachable(dst)) {
3190     movl(as_Address(dst), src);
3191   } else {
3192     lea(rscratch1, dst);
3193     movl(Address(rscratch1, 0), src);
3194   }
3195 }
3196 
3197 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3198   if (reachable(src)) {
3199     movl(dst, as_Address(src));
3200   } else {
3201     lea(rscratch1, src);
3202     movl(dst, Address(rscratch1, 0));
3203   }
3204 }
3205 
3206 // C++ bool manipulation
3207 
3208 void MacroAssembler::movbool(Register dst, Address src) {
3209   if(sizeof(bool) == 1)
3210     movb(dst, src);
3211   else if(sizeof(bool) == 2)
3212     movw(dst, src);
3213   else if(sizeof(bool) == 4)
3214     movl(dst, src);
3215   else
3216     // unsupported
3217     ShouldNotReachHere();
3218 }
3219 
3220 void MacroAssembler::movbool(Address dst, bool boolconst) {
3221   if(sizeof(bool) == 1)
3222     movb(dst, (int) boolconst);
3223   else if(sizeof(bool) == 2)
3224     movw(dst, (int) boolconst);
3225   else if(sizeof(bool) == 4)
3226     movl(dst, (int) boolconst);
3227   else
3228     // unsupported
3229     ShouldNotReachHere();
3230 }
3231 
3232 void MacroAssembler::movbool(Address dst, Register src) {
3233   if(sizeof(bool) == 1)
3234     movb(dst, src);
3235   else if(sizeof(bool) == 2)
3236     movw(dst, src);
3237   else if(sizeof(bool) == 4)
3238     movl(dst, src);
3239   else
3240     // unsupported
3241     ShouldNotReachHere();
3242 }
3243 
3244 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3245   movb(as_Address(dst), src);
3246 }
3247 
3248 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3249   if (reachable(src)) {
3250     movdl(dst, as_Address(src));
3251   } else {
3252     lea(rscratch1, src);
3253     movdl(dst, Address(rscratch1, 0));
3254   }
3255 }
3256 
3257 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3258   if (reachable(src)) {
3259     movq(dst, as_Address(src));
3260   } else {
3261     lea(rscratch1, src);
3262     movq(dst, Address(rscratch1, 0));
3263   }
3264 }
3265 
3266 #ifdef COMPILER2
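     // Set k1 = (1 << src) - 1, the residual-iteration mask used by post loop
     // multiversioning; dst is used as a temp and ends up holding the value of src.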
3267 void MacroAssembler::setvectmask(Register dst, Register src) {
3268   guarantee(PostLoopMultiversioning, "must be");
3269   Assembler::movl(dst, 1);
3270   Assembler::shlxl(dst, dst, src);
3271   Assembler::decl(dst);
3272   Assembler::kmovdl(k1, dst);
3273   Assembler::movl(dst, src);
3274 }
3275 
3276 void MacroAssembler::restorevectmask() {
3277   guarantee(PostLoopMultiversioning, "must be");
3278   Assembler::knotwl(k1, k0);
3279 }
3280 #endif // COMPILER2
3281 
3282 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3283   if (reachable(src)) {
3284     if (UseXmmLoadAndClearUpper) {
3285       movsd (dst, as_Address(src));
3286     } else {
3287       movlpd(dst, as_Address(src));
3288     }
3289   } else {
3290     lea(rscratch1, src);
3291     if (UseXmmLoadAndClearUpper) {
3292       movsd (dst, Address(rscratch1, 0));
3293     } else {
3294       movlpd(dst, Address(rscratch1, 0));
3295     }
3296   }
3297 }
3298 
3299 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3300   if (reachable(src)) {
3301     movss(dst, as_Address(src));
3302   } else {
3303     lea(rscratch1, src);
3304     movss(dst, Address(rscratch1, 0));
3305   }
3306 }
3307 
3308 void MacroAssembler::movptr(Register dst, Register src) {
3309   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3310 }
3311 
3312 void MacroAssembler::movptr(Register dst, Address src) {
3313   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3314 }
3315 
3316 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3317 void MacroAssembler::movptr(Register dst, intptr_t src) {
3318   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3319 }
3320 
3321 void MacroAssembler::movptr(Address dst, Register src) {
3322   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3323 }
3324 
3325 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3326     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3327     Assembler::movdqu(dst, src);
3328 }
3329 
3330 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3331     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3332     Assembler::movdqu(dst, src);
3333 }
3334 
3335 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3336     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3337     Assembler::movdqu(dst, src);
3338 }
3339 
3340 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3341   if (reachable(src)) {
3342     movdqu(dst, as_Address(src));
3343   } else {
3344     lea(scratchReg, src);
3345     movdqu(dst, Address(scratchReg, 0));
3346   }
3347 }
3348 
3349 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3350     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3351     Assembler::vmovdqu(dst, src);
3352 }
3353 
3354 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3355     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3356     Assembler::vmovdqu(dst, src);
3357 }
3358 
3359 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3360     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3361     Assembler::vmovdqu(dst, src);
3362 }
3363 
3364 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3365   if (reachable(src)) {
3366     vmovdqu(dst, as_Address(src));
3367   }
3368   else {
3369     lea(scratch_reg, src);
3370     vmovdqu(dst, Address(scratch_reg, 0));
3371   }
3372 }
3373 
3374 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3375   if (reachable(src)) {
3376     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3377   } else {
3378     lea(rscratch, src);
3379     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3380   }
3381 }
3382 
3383 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3384   if (reachable(src)) {
3385     Assembler::movdqa(dst, as_Address(src));
3386   } else {
3387     lea(rscratch1, src);
3388     Assembler::movdqa(dst, Address(rscratch1, 0));
3389   }
3390 }
3391 
3392 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3393   if (reachable(src)) {
3394     Assembler::movsd(dst, as_Address(src));
3395   } else {
3396     lea(rscratch1, src);
3397     Assembler::movsd(dst, Address(rscratch1, 0));
3398   }
3399 }
3400 
3401 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3402   if (reachable(src)) {
3403     Assembler::movss(dst, as_Address(src));
3404   } else {
3405     lea(rscratch1, src);
3406     Assembler::movss(dst, Address(rscratch1, 0));
3407   }
3408 }
3409 
3410 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3411   if (reachable(src)) {
3412     Assembler::mulsd(dst, as_Address(src));
3413   } else {
3414     lea(rscratch1, src);
3415     Assembler::mulsd(dst, Address(rscratch1, 0));
3416   }
3417 }
3418 
3419 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3420   if (reachable(src)) {
3421     Assembler::mulss(dst, as_Address(src));
3422   } else {
3423     lea(rscratch1, src);
3424     Assembler::mulss(dst, Address(rscratch1, 0));
3425   }
3426 }
3427 
3428 void MacroAssembler::null_check(Register reg, int offset) {
3429   if (needs_explicit_null_check(offset)) {
3430     // provoke OS NULL exception if reg = NULL by
3431     // accessing M[reg] w/o changing any (non-CC) registers
3432     // NOTE: cmpl is plenty here to provoke a segv
3433     cmpptr(rax, Address(reg, 0));
3434     // Note: should probably use testl(rax, Address(reg, 0));
3435     //       may be shorter code (however, this version of
3436     //       testl needs to be implemented first)
3437   } else {
3438     // nothing to do, (later) access of M[reg + offset]
3439     // will provoke OS NULL exception if reg = NULL
3440   }
3441 }
3442 
3443 void MacroAssembler::os_breakpoint() {
3444   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3445   // (e.g., MSVC can't call ps() otherwise)
3446   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3447 }
3448 
3449 void MacroAssembler::unimplemented(const char* what) {
3450   const char* buf = NULL;
3451   {
3452     ResourceMark rm;
3453     stringStream ss;
3454     ss.print("unimplemented: %s", what);
3455     buf = code_string(ss.as_string());
3456   }
3457   stop(buf);
3458 }
3459 
3460 #ifdef _LP64
3461 #define XSTATE_BV 0x200
3462 #endif
3463 
3464 void MacroAssembler::pop_CPU_state() {
3465   pop_FPU_state();
3466   pop_IU_state();
3467 }
3468 
3469 void MacroAssembler::pop_FPU_state() {
3470 #ifndef _LP64
3471   frstor(Address(rsp, 0));
3472 #else
3473   fxrstor(Address(rsp, 0));
3474 #endif
3475   addptr(rsp, FPUStateSizeInWords * wordSize);
3476 }
3477 
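     // Restore the integer registers and flags saved by push_IU_state; the addq
     // undoes the 8-byte alignment padding pushed on 64-bit.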
3478 void MacroAssembler::pop_IU_state() {
3479   popa();
3480   LP64_ONLY(addq(rsp, 8));
3481   popf();
3482 }
3483 
3484 // Save Integer and Float state
3485 // Warning: Stack must be 16 byte aligned (64bit)
3486 void MacroAssembler::push_CPU_state() {
3487   push_IU_state();
3488   push_FPU_state();
3489 }
3490 
3491 void MacroAssembler::push_FPU_state() {
3492   subptr(rsp, FPUStateSizeInWords * wordSize);
3493 #ifndef _LP64
3494   fnsave(Address(rsp, 0));
3495   fwait();
3496 #else
3497   fxsave(Address(rsp, 0));
3498 #endif // LP64
3499 }
3500 
3501 void MacroAssembler::push_IU_state() {
3502   // Push flags first because pusha kills them
3503   pushf();
3504   // Make sure rsp stays 16-byte aligned
3505   LP64_ONLY(subq(rsp, 8));
3506   pusha();
3507 }
3508 
3509 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
       // determine java_thread register
3510   if (!java_thread->is_valid()) {
3511     java_thread = rdi;
3512     get_thread(java_thread);
3513   }
3514   // we must set sp to zero to clear frame
3515   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3516   if (clear_fp) {
3517     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3518   }
3519 
3520   // Always clear the pc because it could have been set by make_walkable()
3521   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3522 
3523   vzeroupper();
3524 }
3525 
3526 void MacroAssembler::restore_rax(Register tmp) {
3527   if (tmp == noreg) pop(rax);
3528   else if (tmp != rax) mov(rax, tmp);
3529 }
3530 
3531 void MacroAssembler::round_to(Register reg, int modulus) {
3532   addptr(reg, modulus - 1);
3533   andptr(reg, -modulus);
3534 }
3535 
3536 void MacroAssembler::save_rax(Register tmp) {
3537   if (tmp == noreg) push(rax);
3538   else if (tmp != rax) mov(tmp, rax);
3539 }
3540 
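     // Branch to slow_path if a safepoint or handshake is pending: either test
     // the thread-local poll bit or compare against the global safepoint state.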
3541 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3542   if (SafepointMechanism::uses_thread_local_poll()) {
3543 #ifdef _LP64
3544     assert(thread_reg == r15_thread, "should be");
3545 #else
3546     if (thread_reg == noreg) {
3547       thread_reg = temp_reg;
3548       get_thread(thread_reg);
3549     }
3550 #endif
3551     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3552     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3553   } else {
3554     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3555         SafepointSynchronize::_not_synchronized);
3556     jcc(Assembler::notEqual, slow_path);
3557   }
3558 }
3559 
3560 // Calls to C land
3561 //
3562 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3563 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3564 // has to be reset to 0. This is required to allow proper stack traversal.
3565 void MacroAssembler::set_last_Java_frame(Register java_thread,
3566                                          Register last_java_sp,
3567                                          Register last_java_fp,
3568                                          address  last_java_pc) {
3569   vzeroupper();
3570   // determine java_thread register
3571   if (!java_thread->is_valid()) {
3572     java_thread = rdi;
3573     get_thread(java_thread);
3574   }
3575   // determine last_java_sp register
3576   if (!last_java_sp->is_valid()) {
3577     last_java_sp = rsp;
3578   }
3579 
3580   // last_java_fp is optional
3581 
3582   if (last_java_fp->is_valid()) {
3583     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3584   }
3585 
3586   // last_java_pc is optional
3587 
3588   if (last_java_pc != NULL) {
3589     lea(Address(java_thread,
3590                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3591         InternalAddress(last_java_pc));
3592 
3593   }
3594   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3595 }
3596 
3597 void MacroAssembler::shlptr(Register dst, int imm8) {
3598   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3599 }
3600 
3601 void MacroAssembler::shrptr(Register dst, int imm8) {
3602   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3603 }
3604 
3605 void MacroAssembler::sign_extend_byte(Register reg) {
3606   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3607     movsbl(reg, reg); // movsxb
3608   } else {
3609     shll(reg, 24);
3610     sarl(reg, 24);
3611   }
3612 }
3613 
3614 void MacroAssembler::sign_extend_short(Register reg) {
3615   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3616     movswl(reg, reg); // movsxw
3617   } else {
3618     shll(reg, 16);
3619     sarl(reg, 16);
3620   }
3621 }
3622 
3623 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3624   assert(reachable(src), "Address should be reachable");
3625   testl(dst, as_Address(src));
3626 }
3627 
3628 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3629   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3630   Assembler::pcmpeqb(dst, src);
3631 }
3632 
3633 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3634   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3635   Assembler::pcmpeqw(dst, src);
3636 }
3637 
3638 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3639   assert((dst->encoding() < 16),"XMM register should be 0-15");
3640   Assembler::pcmpestri(dst, src, imm8);
3641 }
3642 
3643 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3644   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3645   Assembler::pcmpestri(dst, src, imm8);
3646 }
3647 
3648 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3649   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3650   Assembler::pmovzxbw(dst, src);
3651 }
3652 
3653 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3654   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3655   Assembler::pmovzxbw(dst, src);
3656 }
3657 
3658 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3659   assert((src->encoding() < 16),"XMM register should be 0-15");
3660   Assembler::pmovmskb(dst, src);
3661 }
3662 
3663 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3664   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3665   Assembler::ptest(dst, src);
3666 }
3667 
3668 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3669   if (reachable(src)) {
3670     Assembler::sqrtsd(dst, as_Address(src));
3671   } else {
3672     lea(rscratch1, src);
3673     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3674   }
3675 }
3676 
3677 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3678   if (reachable(src)) {
3679     Assembler::sqrtss(dst, as_Address(src));
3680   } else {
3681     lea(rscratch1, src);
3682     Assembler::sqrtss(dst, Address(rscratch1, 0));
3683   }
3684 }
3685 
3686 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3687   if (reachable(src)) {
3688     Assembler::subsd(dst, as_Address(src));
3689   } else {
3690     lea(rscratch1, src);
3691     Assembler::subsd(dst, Address(rscratch1, 0));
3692   }
3693 }
3694 
3695 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3696   if (reachable(src)) {
3697     Assembler::subss(dst, as_Address(src));
3698   } else {
3699     lea(rscratch1, src);
3700     Assembler::subss(dst, Address(rscratch1, 0));
3701   }
3702 }
3703 
3704 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3705   if (reachable(src)) {
3706     Assembler::ucomisd(dst, as_Address(src));
3707   } else {
3708     lea(rscratch1, src);
3709     Assembler::ucomisd(dst, Address(rscratch1, 0));
3710   }
3711 }
3712 
3713 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3714   if (reachable(src)) {
3715     Assembler::ucomiss(dst, as_Address(src));
3716   } else {
3717     lea(rscratch1, src);
3718     Assembler::ucomiss(dst, Address(rscratch1, 0));
3719   }
3720 }
3721 
3722 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3723   // Used in sign-bit flipping with aligned address.
3724   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3725   if (reachable(src)) {
3726     Assembler::xorpd(dst, as_Address(src));
3727   } else {
3728     lea(scratch_reg, src);
3729     Assembler::xorpd(dst, Address(scratch_reg, 0));
3730   }
3731 }
3732 
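     // On AVX-512 targets without DQ support, xor of a register with itself (the
     // common zeroing idiom) is emitted as a 512-bit vpxor, which needs only AVX512F.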
3733 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3734   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3735     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3736   }
3737   else {
3738     Assembler::xorpd(dst, src);
3739   }
3740 }
3741 
3742 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3743   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3744     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3745   } else {
3746     Assembler::xorps(dst, src);
3747   }
3748 }
3749 
3750 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3751   // Used in sign-bit flipping with aligned address.
3752   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3753   if (reachable(src)) {
3754     Assembler::xorps(dst, as_Address(src));
3755   } else {
3756     lea(scratch_reg, src);
3757     Assembler::xorps(dst, Address(scratch_reg, 0));
3758   }
3759 }
3760 
3761 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3762   // Used in sign-bit flipping with aligned address.
3763   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3764   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3765   if (reachable(src)) {
3766     Assembler::pshufb(dst, as_Address(src));
3767   } else {
3768     lea(rscratch1, src);
3769     Assembler::pshufb(dst, Address(rscratch1, 0));
3770   }
3771 }
3772 
3773 // AVX 3-operands instructions
3774 
3775 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3776   if (reachable(src)) {
3777     vaddsd(dst, nds, as_Address(src));
3778   } else {
3779     lea(rscratch1, src);
3780     vaddsd(dst, nds, Address(rscratch1, 0));
3781   }
3782 }
3783 
3784 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3785   if (reachable(src)) {
3786     vaddss(dst, nds, as_Address(src));
3787   } else {
3788     lea(rscratch1, src);
3789     vaddss(dst, nds, Address(rscratch1, 0));
3790   }
3791 }
3792 
3793 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3794   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3795   vandps(dst, nds, negate_field, vector_len);
3796 }
3797 
3798 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3799   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3800   vandpd(dst, nds, negate_field, vector_len);
3801 }
3802 
3803 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3804   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3805   Assembler::vpaddb(dst, nds, src, vector_len);
3806 }
3807 
3808 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3809   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3810   Assembler::vpaddb(dst, nds, src, vector_len);
3811 }
3812 
3813 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3814   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3815   Assembler::vpaddw(dst, nds, src, vector_len);
3816 }
3817 
3818 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3819   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3820   Assembler::vpaddw(dst, nds, src, vector_len);
3821 }
3822 
3823 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3824   if (reachable(src)) {
3825     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3826   } else {
3827     lea(scratch_reg, src);
3828     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3829   }
3830 }
3831 
3832 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3833   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3834   Assembler::vpbroadcastw(dst, src, vector_len);
3835 }
3836 
3837 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3838   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3839   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3840 }
3841 
3842 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3843   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3844   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3845 }
3846 
3847 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3848   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3849   Assembler::vpmovzxbw(dst, src, vector_len);
3850 }
3851 
3852 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3853   assert((src->encoding() < 16),"XMM register should be 0-15");
3854   Assembler::vpmovmskb(dst, src);
3855 }
3856 
3857 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3858   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3859   Assembler::vpmullw(dst, nds, src, vector_len);
3860 }
3861 
3862 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3863   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3864   Assembler::vpmullw(dst, nds, src, vector_len);
3865 }
3866 
3867 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3868   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3869   Assembler::vpsubb(dst, nds, src, vector_len);
3870 }
3871 
3872 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3873   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3874   Assembler::vpsubb(dst, nds, src, vector_len);
3875 }
3876 
3877 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3878   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3879   Assembler::vpsubw(dst, nds, src, vector_len);
3880 }
3881 
3882 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3883   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3884   Assembler::vpsubw(dst, nds, src, vector_len);
3885 }
3886 
3887 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3888   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3889   Assembler::vpsraw(dst, nds, shift, vector_len);
3890 }
3891 
3892 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3893   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3894   Assembler::vpsraw(dst, nds, shift, vector_len);
3895 }
3896 
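     // 64-bit arithmetic right shifts are EVEX-only; without AVX512VL only the
     // 512-bit form is encodable, so widen vector_len accordingly.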
3897 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3898   assert(UseAVX > 2,"");
3899   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3900      vector_len = 2;
3901   }
3902   Assembler::evpsraq(dst, nds, shift, vector_len);
3903 }
3904 
3905 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3906   assert(UseAVX > 2,"");
3907   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3908      vector_len = 2;
3909   }
3910   Assembler::evpsraq(dst, nds, shift, vector_len);
3911 }
3912 
3913 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3914   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3915   Assembler::vpsrlw(dst, nds, shift, vector_len);
3916 }
3917 
3918 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3919   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3920   Assembler::vpsrlw(dst, nds, shift, vector_len);
3921 }
3922 
3923 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3924   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3925   Assembler::vpsllw(dst, nds, shift, vector_len);
3926 }
3927 
3928 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3929   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3930   Assembler::vpsllw(dst, nds, shift, vector_len);
3931 }
3932 
3933 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3934   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3935   Assembler::vptest(dst, src);
3936 }
3937 
3938 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3939   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3940   Assembler::punpcklbw(dst, src);
3941 }
3942 
3943 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3944   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3945   Assembler::pshufd(dst, src, mode);
3946 }
3947 
3948 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3949   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3950   Assembler::pshuflw(dst, src, mode);
3951 }
3952 
3953 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3954   if (reachable(src)) {
3955     vandpd(dst, nds, as_Address(src), vector_len);
3956   } else {
3957     lea(scratch_reg, src);
3958     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3959   }
3960 }
3961 
3962 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3963   if (reachable(src)) {
3964     vandps(dst, nds, as_Address(src), vector_len);
3965   } else {
3966     lea(scratch_reg, src);
3967     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3968   }
3969 }
3970 
3971 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3972   if (reachable(src)) {
3973     vdivsd(dst, nds, as_Address(src));
3974   } else {
3975     lea(rscratch1, src);
3976     vdivsd(dst, nds, Address(rscratch1, 0));
3977   }
3978 }
3979 
3980 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3981   if (reachable(src)) {
3982     vdivss(dst, nds, as_Address(src));
3983   } else {
3984     lea(rscratch1, src);
3985     vdivss(dst, nds, Address(rscratch1, 0));
3986   }
3987 }
3988 
3989 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3990   if (reachable(src)) {
3991     vmulsd(dst, nds, as_Address(src));
3992   } else {
3993     lea(rscratch1, src);
3994     vmulsd(dst, nds, Address(rscratch1, 0));
3995   }
3996 }
3997 
3998 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3999   if (reachable(src)) {
4000     vmulss(dst, nds, as_Address(src));
4001   } else {
4002     lea(rscratch1, src);
4003     vmulss(dst, nds, Address(rscratch1, 0));
4004   }
4005 }
4006 
4007 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4008   if (reachable(src)) {
4009     vsubsd(dst, nds, as_Address(src));
4010   } else {
4011     lea(rscratch1, src);
4012     vsubsd(dst, nds, Address(rscratch1, 0));
4013   }
4014 }
4015 
4016 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4017   if (reachable(src)) {
4018     vsubss(dst, nds, as_Address(src));
4019   } else {
4020     lea(rscratch1, src);
4021     vsubss(dst, nds, Address(rscratch1, 0));
4022   }
4023 }
4024 
4025 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4026   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4027   vxorps(dst, nds, src, Assembler::AVX_128bit);
4028 }
4029 
4030 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4031   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4032   vxorpd(dst, nds, src, Assembler::AVX_128bit);
4033 }
4034 
4035 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4036   if (reachable(src)) {
4037     vxorpd(dst, nds, as_Address(src), vector_len);
4038   } else {
4039     lea(scratch_reg, src);
4040     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
4041   }
4042 }
4043 
4044 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4045   if (reachable(src)) {
4046     vxorps(dst, nds, as_Address(src), vector_len);
4047   } else {
4048     lea(scratch_reg, src);
4049     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
4050   }
4051 }
4052 
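     // Integer vpxor on 256-bit vectors requires AVX2; on AVX-only hardware fall
     // back to vxorpd, which produces the same bit pattern.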
4053 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4054   if (UseAVX > 1 || (vector_len < 1)) {
4055     if (reachable(src)) {
4056       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
4057     } else {
4058       lea(scratch_reg, src);
4059       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
4060     }
4061   }
4062   else {
4063     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
4064   }
4065 }
4066 
4067 //-------------------------------------------------------------------------------------------
4068 #ifdef COMPILER2
4069 // Generic instructions support for use in .ad files C2 code generation
4070 
4071 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) {
4072   if (opcode == Op_AbsVD) {
4073     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
4074   } else {
4075     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4076     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
4077   }
4078 }
4079 
4080 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4081   if (opcode == Op_AbsVD) {
4082     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
4083   } else {
4084     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4085     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
4086   }
4087 }
4088 
4089 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) {
4090   if (opcode == Op_AbsVF) {
4091     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
4092   } else {
4093     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4094     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
4095   }
4096 }
4097 
4098 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4099   if (opcode == Op_AbsVF) {
4100     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
4101   } else {
4102     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4103     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
4104   }
4105 }
4106 
4107 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
4108   if (sign) {
4109     pmovsxbw(dst, src);
4110   } else {
4111     pmovzxbw(dst, src);
4112   }
4113 }
4114 
4115 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
4116   if (sign) {
4117     vpmovsxbw(dst, src, vector_len);
4118   } else {
4119     vpmovzxbw(dst, src, vector_len);
4120   }
4121 }
4122 
4123 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
4124   if (opcode == Op_RShiftVI) {
4125     psrad(dst, src);
4126   } else if (opcode == Op_LShiftVI) {
4127     pslld(dst, src);
4128   } else {
4129     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4130     psrld(dst, src);
4131   }
4132 }
4133 
4134 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4135   if (opcode == Op_RShiftVI) {
4136     vpsrad(dst, nds, src, vector_len);
4137   } else if (opcode == Op_LShiftVI) {
4138     vpslld(dst, nds, src, vector_len);
4139   } else {
4140     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4141     vpsrld(dst, nds, src, vector_len);
4142   }
4143 }
4144 
4145 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
4146   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4147     psraw(dst, src);
4148   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4149     psllw(dst, src);
4150   } else {
4151     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4152     psrlw(dst, src);
4153   }
4154 }
4155 
4156 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4157   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4158     vpsraw(dst, nds, src, vector_len);
4159   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4160     vpsllw(dst, nds, src, vector_len);
4161   } else {
4162     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4163     vpsrlw(dst, nds, src, vector_len);
4164   }
4165 }
4166 
4167 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
4168   if (opcode == Op_RShiftVL) {
4169     psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
4170   } else if (opcode == Op_LShiftVL) {
4171     psllq(dst, src);
4172   } else {
4173     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4174     psrlq(dst, src);
4175   }
4176 }
4177 
4178 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4179   if (opcode == Op_RShiftVL) {
4180     evpsraq(dst, nds, src, vector_len);
4181   } else if (opcode == Op_LShiftVL) {
4182     vpsllq(dst, nds, src, vector_len);
4183   } else {
4184     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4185     vpsrlq(dst, nds, src, vector_len);
4186   }
4187 }
4188 #endif // COMPILER2
4189 //-------------------------------------------------------------------------------------------
4190 
4191 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4192   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4193   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4194   // The inverted mask is sign-extended
4195   andptr(possibly_jweak, inverted_jweak_mask);
4196 }
4197 
4198 void MacroAssembler::resolve_jobject(Register value,
4199                                      Register thread,
4200                                      Register tmp) {
4201   assert_different_registers(value, thread, tmp);
4202   Label done, not_weak;
4203   testptr(value, value);
4204   jcc(Assembler::zero, done);                // Use NULL as-is.
4205   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4206   jcc(Assembler::zero, not_weak);
4207   // Resolve jweak.
4208   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4209                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4210   verify_oop(value);
4211   jmp(done);
4212   bind(not_weak);
4213   // Resolve (untagged) jobject.
4214   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4215   verify_oop(value);
4216   bind(done);
4217 }
4218 
4219 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4220   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4221 }
4222 
4223 // Force generation of a 4 byte immediate value even if it fits into 8bit
4224 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4225   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4226 }
4227 
4228 void MacroAssembler::subptr(Register dst, Register src) {
4229   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4230 }
4231 
4232 // C++ bool manipulation
4233 void MacroAssembler::testbool(Register dst) {
4234   if(sizeof(bool) == 1)
4235     testb(dst, 0xff);
4236   else if(sizeof(bool) == 2) {
4237     // testw implementation needed for two byte bools
4238     ShouldNotReachHere();
4239   } else if(sizeof(bool) == 4)
4240     testl(dst, dst);
4241   else
4242     // unsupported
4243     ShouldNotReachHere();
4244 }
4245 
4246 void MacroAssembler::testptr(Register dst, Register src) {
4247   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4248 }
4249 
4250 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4251 void MacroAssembler::tlab_allocate(Register thread, Register obj,
4252                                    Register var_size_in_bytes,
4253                                    int con_size_in_bytes,
4254                                    Register t1,
4255                                    Register t2,
4256                                    Label& slow_case) {
4257   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4258   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4259 }
4260 
4261 // Defines obj, preserves var_size_in_bytes
4262 void MacroAssembler::eden_allocate(Register thread, Register obj,
4263                                    Register var_size_in_bytes,
4264                                    int con_size_in_bytes,
4265                                    Register t1,
4266                                    Label& slow_case) {
4267   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4268   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4269 }
4270 
4271 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
4272 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4273   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4274   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4275   Label done;
4276 
4277   testptr(length_in_bytes, length_in_bytes);
4278   jcc(Assembler::zero, done);
4279 
4280   // initialize topmost word, divide index by 2, check if odd and test if zero
4281   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4282 #ifdef ASSERT
4283   {
4284     Label L;
4285     testptr(length_in_bytes, BytesPerWord - 1);
4286     jcc(Assembler::zero, L);
4287     stop("length must be a multiple of BytesPerWord");
4288     bind(L);
4289   }
4290 #endif
4291   Register index = length_in_bytes;
4292   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4293   if (UseIncDec) {
4294     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4295   } else {
4296     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4297     shrptr(index, 1);
4298   }
4299 #ifndef _LP64
4300   // index might not have been a multiple of 8 (i.e., bit 2 was set)
4301   {
4302     Label even;
4303     // note: if index was a multiple of 8, then it cannot
4304     //       be 0 now otherwise it must have been 0 before
4305     //       => if it is even, we don't need to check for 0 again
4306     jcc(Assembler::carryClear, even);
4307     // clear topmost word (no jump would be needed if conditional assignment worked here)
4308     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4309     // index could be 0 now, must check again
4310     jcc(Assembler::zero, done);
4311     bind(even);
4312   }
4313 #endif // !_LP64
4314   // initialize remaining object fields: index is a multiple of 2 now
4315   {
4316     Label loop;
4317     bind(loop);
4318     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4319     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4320     decrement(index);
4321     jcc(Assembler::notZero, loop);
4322   }
4323 
4324   bind(done);
4325 }
4326 
4327 // Look up the method for a megamorphic invokeinterface call.
4328 // The target method is determined by <intf_klass, itable_index>.
4329 // The receiver klass is in recv_klass.
4330 // On success, the result will be in method_result, and execution falls through.
4331 // On failure, execution transfers to the given label.
4332 void MacroAssembler::lookup_interface_method(Register recv_klass,
4333                                              Register intf_klass,
4334                                              RegisterOrConstant itable_index,
4335                                              Register method_result,
4336                                              Register scan_temp,
4337                                              Label& L_no_such_interface,
4338                                              bool return_method) {
4339   assert_different_registers(recv_klass, intf_klass, scan_temp);
4340   assert_different_registers(method_result, intf_klass, scan_temp);
4341   assert(recv_klass != method_result || !return_method,
4342          "recv_klass can be destroyed when method isn't needed");
4343 
4344   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4345          "caller must use same register for non-constant itable index as for method");
4346 
4347   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4348   int vtable_base = in_bytes(Klass::vtable_start_offset());
4349   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4350   int scan_step   = itableOffsetEntry::size() * wordSize;
4351   int vte_size    = vtableEntry::size_in_bytes();
4352   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4353   assert(vte_size == wordSize, "else adjust times_vte_scale");
4354 
4355   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4356 
4357   // %%% Could store the aligned, prescaled offset in the klassoop.
4358   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4359 
4360   if (return_method) {
4361     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4362     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4363     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4364   }
4365 
4366   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4367   //   if (scan->interface() == intf) {
4368   //     result = (klass + scan->offset() + itable_index);
4369   //   }
4370   // }
4371   Label search, found_method;
4372 
4373   for (int peel = 1; peel >= 0; peel--) {
4374     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4375     cmpptr(intf_klass, method_result);
4376 
4377     if (peel) {
4378       jccb(Assembler::equal, found_method);
4379     } else {
4380       jccb(Assembler::notEqual, search);
4381       // (invert the test to fall through to found_method...)
4382     }
4383 
4384     if (!peel)  break;
4385 
4386     bind(search);
4387 
4388     // Check that the previous entry is non-null.  A null entry means that
4389     // the receiver class doesn't implement the interface, and wasn't the
4390     // same as when the caller was compiled.
4391     testptr(method_result, method_result);
4392     jcc(Assembler::zero, L_no_such_interface);
4393     addptr(scan_temp, scan_step);
4394   }
4395 
4396   bind(found_method);
4397 
4398   if (return_method) {
4399     // Got a hit.
4400     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4401     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4402   }
4403 }
4404 
4405 
4406 // virtual method calling
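// The entry is simply loaded from recv_klass + vtable_start_offset + vtable_index * wordSize
// + vtableEntry::method_offset, so no scratch register is needed.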
4407 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4408                                            RegisterOrConstant vtable_index,
4409                                            Register method_result) {
4410   const int base = in_bytes(Klass::vtable_start_offset());
4411   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4412   Address vtable_entry_addr(recv_klass,
4413                             vtable_index, Address::times_ptr,
4414                             base + vtableEntry::method_offset_in_bytes());
4415   movptr(method_result, vtable_entry_addr);
4416 }
4417 
4418 
4419 void MacroAssembler::check_klass_subtype(Register sub_klass,
4420                            Register super_klass,
4421                            Register temp_reg,
4422                            Label& L_success) {
4423   Label L_failure;
4424   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4425   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4426   bind(L_failure);
4427 }
4428 
4429 
4430 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4431                                                    Register super_klass,
4432                                                    Register temp_reg,
4433                                                    Label* L_success,
4434                                                    Label* L_failure,
4435                                                    Label* L_slow_path,
4436                                         RegisterOrConstant super_check_offset) {
4437   assert_different_registers(sub_klass, super_klass, temp_reg);
4438   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4439   if (super_check_offset.is_register()) {
4440     assert_different_registers(sub_klass, super_klass,
4441                                super_check_offset.as_register());
4442   } else if (must_load_sco) {
4443     assert(temp_reg != noreg, "supply either a temp or a register offset");
4444   }
4445 
4446   Label L_fallthrough;
4447   int label_nulls = 0;
4448   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4449   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4450   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4451   assert(label_nulls <= 1, "at most one NULL in the batch");
4452 
4453   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4454   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4455   Address super_check_offset_addr(super_klass, sco_offset);
4456 
4457   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4458   // range of a jccb.  If this routine grows larger, reconsider at
4459   // least some of these.
4460 #define local_jcc(assembler_cond, label)                                \
4461   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4462   else                             jcc( assembler_cond, label) /*omit semi*/
4463 
4464   // Hacked jmp, which may only be used just before L_fallthrough.
4465 #define final_jmp(label)                                                \
4466   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4467   else                            jmp(label)                /*omit semi*/
4468 
4469   // If the pointers are equal, we are done (e.g., String[] elements).
4470   // This self-check enables sharing of secondary supertype arrays among
4471   // non-primary types such as array-of-interface.  Otherwise, each such
4472   // type would need its own customized SSA.
4473   // We move this check to the front of the fast path because many
4474   // type checks are in fact trivially successful in this manner,
4475   // so we get a nicely predicted branch right at the start of the check.
4476   cmpptr(sub_klass, super_klass);
4477   local_jcc(Assembler::equal, *L_success);
4478 
4479   // Check the supertype display:
4480   if (must_load_sco) {
4481     // Positive movl does the right thing on LP64.
4482     movl(temp_reg, super_check_offset_addr);
4483     super_check_offset = RegisterOrConstant(temp_reg);
4484   }
4485   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4486   cmpptr(super_klass, super_check_addr); // load displayed supertype
4487 
4488   // This check has worked decisively for primary supers.
4489   // Secondary supers are sought in the super_cache ('super_cache_addr').
4490   // (Secondary supers are interfaces and very deeply nested subtypes.)
4491   // This works in the same check above because of a tricky aliasing
4492   // between the super_cache and the primary super display elements.
4493   // (The 'super_check_addr' can address either, as the case requires.)
4494   // Note that the cache is updated below if it does not help us find
4495   // what we need immediately.
4496   // So if it was a primary super, we can just fail immediately.
4497   // Otherwise, it's the slow path for us (no success at this point).
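  // In rough pseudocode, the dispatch below is:
  //   if (the displayed supertype matched)        goto L_success;
  //   else if (super_check_offset == sc_offset)   goto L_slow_path;  // only the cache was consulted
  //   else                                        goto L_failure;    // a primary display slot missed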
4498 
4499   if (super_check_offset.is_register()) {
4500     local_jcc(Assembler::equal, *L_success);
4501     cmpl(super_check_offset.as_register(), sc_offset);
4502     if (L_failure == &L_fallthrough) {
4503       local_jcc(Assembler::equal, *L_slow_path);
4504     } else {
4505       local_jcc(Assembler::notEqual, *L_failure);
4506       final_jmp(*L_slow_path);
4507     }
4508   } else if (super_check_offset.as_constant() == sc_offset) {
4509     // Need a slow path; fast failure is impossible.
4510     if (L_slow_path == &L_fallthrough) {
4511       local_jcc(Assembler::equal, *L_success);
4512     } else {
4513       local_jcc(Assembler::notEqual, *L_slow_path);
4514       final_jmp(*L_success);
4515     }
4516   } else {
4517     // No slow path; it's a fast decision.
4518     if (L_failure == &L_fallthrough) {
4519       local_jcc(Assembler::equal, *L_success);
4520     } else {
4521       local_jcc(Assembler::notEqual, *L_failure);
4522       final_jmp(*L_success);
4523     }
4524   }
4525 
4526   bind(L_fallthrough);
4527 
4528 #undef local_jcc
4529 #undef final_jmp
4530 }
4531 
4532 
4533 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4534                                                    Register super_klass,
4535                                                    Register temp_reg,
4536                                                    Register temp2_reg,
4537                                                    Label* L_success,
4538                                                    Label* L_failure,
4539                                                    bool set_cond_codes) {
4540   assert_different_registers(sub_klass, super_klass, temp_reg);
4541   if (temp2_reg != noreg)
4542     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4543 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4544 
4545   Label L_fallthrough;
4546   int label_nulls = 0;
4547   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4548   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4549   assert(label_nulls <= 1, "at most one NULL in the batch");
4550 
4551   // a couple of useful fields in sub_klass:
4552   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4553   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4554   Address secondary_supers_addr(sub_klass, ss_offset);
4555   Address super_cache_addr(     sub_klass, sc_offset);
4556 
4557   // Do a linear scan of the secondary super-klass chain.
4558   // This code is rarely used, so simplicity is a virtue here.
4559   // The repne_scan instruction uses fixed registers, which we must spill.
4560   // Don't worry too much about pre-existing connections with the input regs.
4561 
4562   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4563   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4564 
4565   // Get super_klass value into rax (even if it was in rdi or rcx).
4566   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4567   if (super_klass != rax || UseCompressedOops) {
4568     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4569     mov(rax, super_klass);
4570   }
4571   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4572   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4573 
4574 #ifndef PRODUCT
4575   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4576   ExternalAddress pst_counter_addr((address) pst_counter);
4577   NOT_LP64(  incrementl(pst_counter_addr) );
4578   LP64_ONLY( lea(rcx, pst_counter_addr) );
4579   LP64_ONLY( incrementl(Address(rcx, 0)) );
4580 #endif //PRODUCT
4581 
4582   // We will consult the secondary-super array.
4583   movptr(rdi, secondary_supers_addr);
4584   // Load the array length.  (Positive movl does the right thing on LP64.)
4585   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4586   // Skip to start of data.
4587   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4588 
4589   // Scan RCX words at [RDI] for an occurrence of RAX.
4590   // Set NZ/Z based on last compare.
4591   // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
4592   // not change flags (only the repeated scas instruction sets them).
4593   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
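  // Illustrative C-like equivalent of the repne_scan below:
  //   while (rcx-- != 0 && *(Klass**)rdi++ != (Klass*)rax) { /* keep scanning */ }
  //   // ZF is set iff the last comparison matched, i.e. super_klass was found.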
4594 
4595   testptr(rax, rax); // Set Z = 0
4596   repne_scan();
4597 
4598   // Unspill the temp. registers:
4599   if (pushed_rdi)  pop(rdi);
4600   if (pushed_rcx)  pop(rcx);
4601   if (pushed_rax)  pop(rax);
4602 
4603   if (set_cond_codes) {
4604     // Special hack for the AD files:  rdi is guaranteed non-zero.
4605     assert(!pushed_rdi, "rdi must be left non-NULL");
4606     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4607   }
4608 
4609   if (L_failure == &L_fallthrough)
4610         jccb(Assembler::notEqual, *L_failure);
4611   else  jcc(Assembler::notEqual, *L_failure);
4612 
4613   // Success.  Cache the super we found and proceed in triumph.
4614   movptr(super_cache_addr, super_klass);
4615 
4616   if (L_success != &L_fallthrough) {
4617     jmp(*L_success);
4618   }
4619 
4620 #undef IS_A_TEMP
4621 
4622   bind(L_fallthrough);
4623 }
4624 
4625 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4626   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4627 
4628   Label L_fallthrough;
4629   if (L_fast_path == NULL) {
4630     L_fast_path = &L_fallthrough;
4631   } else if (L_slow_path == NULL) {
4632     L_slow_path = &L_fallthrough;
4633   }
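  // Fast path if the klass is fully initialized, or if the current thread is the one
  // running its <clinit>; otherwise fall into the slow path.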
4634 
4635   // Fast path check: class is fully initialized
4636   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4637   jcc(Assembler::equal, *L_fast_path);
4638 
4639   // Fast path check: current thread is initializer thread
4640   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4641   if (L_slow_path == &L_fallthrough) {
4642     jcc(Assembler::equal, *L_fast_path);
4643     bind(*L_slow_path);
4644   } else if (L_fast_path == &L_fallthrough) {
4645     jcc(Assembler::notEqual, *L_slow_path);
4646     bind(*L_fast_path);
4647   } else {
4648     Unimplemented();
4649   }
4650 }
4651 
4652 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4653   if (VM_Version::supports_cmov()) {
4654     cmovl(cc, dst, src);
4655   } else {
4656     Label L;
4657     jccb(negate_condition(cc), L);
4658     movl(dst, src);
4659     bind(L);
4660   }
4661 }
4662 
4663 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4664   if (VM_Version::supports_cmov()) {
4665     cmovl(cc, dst, src);
4666   } else {
4667     Label L;
4668     jccb(negate_condition(cc), L);
4669     movl(dst, src);
4670     bind(L);
4671   }
4672 }
4673 
4674 void MacroAssembler::verify_oop(Register reg, const char* s) {
4675   if (!VerifyOops) return;
4676 
4677   // Pass register number to verify_oop_subroutine
4678   const char* b = NULL;
4679   {
4680     ResourceMark rm;
4681     stringStream ss;
4682     ss.print("verify_oop: %s: %s", reg->name(), s);
4683     b = code_string(ss.as_string());
4684   }
4685   BLOCK_COMMENT("verify_oop {");
4686 #ifdef _LP64
4687   push(rscratch1);                    // save r10, trashed by movptr()
4688 #endif
4689   push(rax);                          // save rax
4690   push(reg);                          // pass register argument
4691   ExternalAddress buffer((address) b);
4692   // avoid using pushptr, as it modifies scratch registers
4693   // and our contract is not to modify anything
4694   movptr(rax, buffer.addr());
4695   push(rax);
4696   // call indirectly to solve generation ordering problem
4697   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4698   call(rax);
4699   // Caller pops the arguments (oop, message) and restores rax, r10
4700   BLOCK_COMMENT("} verify_oop");
4701 }
4702 
4703 
4704 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
4705                                                       Register tmp,
4706                                                       int offset) {
4707   intptr_t value = *delayed_value_addr;
4708   if (value != 0)
4709     return RegisterOrConstant(value + offset);
4710 
4711   // load indirectly to solve generation ordering problem
4712   movptr(tmp, ExternalAddress((address) delayed_value_addr));
4713 
4714 #ifdef ASSERT
4715   { Label L;
4716     testptr(tmp, tmp);
4717     if (WizardMode) {
4718       const char* buf = NULL;
4719       {
4720         ResourceMark rm;
4721         stringStream ss;
4722         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
4723         buf = code_string(ss.as_string());
4724       }
4725       jcc(Assembler::notZero, L);
4726       STOP(buf);
4727     } else {
4728       jccb(Assembler::notZero, L);
4729       hlt();
4730     }
4731     bind(L);
4732   }
4733 #endif
4734 
4735   if (offset != 0)
4736     addptr(tmp, offset);
4737 
4738   return RegisterOrConstant(tmp);
4739 }
4740 
4741 
4742 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4743                                          int extra_slot_offset) {
4744   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
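  // Computed address: rsp + wordSize (skip the return PC) + expr_offset + arg_slot * stackElementSize.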
4745   int stackElementSize = Interpreter::stackElementSize;
4746   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4747 #ifdef ASSERT
4748   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4749   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4750 #endif
4751   Register             scale_reg    = noreg;
4752   Address::ScaleFactor scale_factor = Address::no_scale;
4753   if (arg_slot.is_constant()) {
4754     offset += arg_slot.as_constant() * stackElementSize;
4755   } else {
4756     scale_reg    = arg_slot.as_register();
4757     scale_factor = Address::times(stackElementSize);
4758   }
4759   offset += wordSize;           // return PC is on stack
4760   return Address(rsp, scale_reg, scale_factor, offset);
4761 }
4762 
4763 
4764 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
4765   if (!VerifyOops) return;
4766 
4767   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4768   // Pass register number to verify_oop_subroutine
4769   const char* b = NULL;
4770   {
4771     ResourceMark rm;
4772     stringStream ss;
4773     ss.print("verify_oop_addr: %s", s);
4774     b = code_string(ss.as_string());
4775   }
4776 #ifdef _LP64
4777   push(rscratch1);                    // save r10, trashed by movptr()
4778 #endif
4779   push(rax);                          // save rax
4780   // addr may contain rsp so we will have to adjust it based on the push
4781   // we just did (and on 64 bit we do two pushes)
4782   // NOTE: the 64-bit code appears to have had a bug where it did movq(addr, rax),
4783   // which stores rax into addr, the reverse of what was intended.
4784   if (addr.uses(rsp)) {
4785     lea(rax, addr);
4786     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4787   } else {
4788     pushptr(addr);
4789   }
4790 
4791   ExternalAddress buffer((address) b);
4792   // pass msg argument
4793   // avoid using pushptr, as it modifies scratch registers
4794   // and our contract is not to modify anything
4795   movptr(rax, buffer.addr());
4796   push(rax);
4797 
4798   // call indirectly to solve generation ordering problem
4799   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4800   call(rax);
4801   // Caller pops the arguments (addr, message) and restores rax, r10.
4802 }
4803 
4804 void MacroAssembler::verify_tlab() {
4805 #ifdef ASSERT
4806   if (UseTLAB && VerifyOops) {
4807     Label next, ok;
4808     Register t1 = rsi;
4809     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4810 
4811     push(t1);
4812     NOT_LP64(push(thread_reg));
4813     NOT_LP64(get_thread(thread_reg));
4814 
4815     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4816     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4817     jcc(Assembler::aboveEqual, next);
4818     STOP("assert(top >= start)");
4819     should_not_reach_here();
4820 
4821     bind(next);
4822     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4823     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4824     jcc(Assembler::aboveEqual, ok);
4825     STOP("assert(top <= end)");
4826     should_not_reach_here();
4827 
4828     bind(ok);
4829     NOT_LP64(pop(thread_reg));
4830     pop(t1);
4831   }
4832 #endif
4833 }
4834 
4835 class ControlWord {
4836  public:
4837   int32_t _value;
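  // The low 16 bits mirror the x87 FPU control word: exception masks in bits 0..5,
  // precision control in bits 8..9, rounding control in bits 10..11.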
4838 
4839   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4840   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4841   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4842   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4843   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4844   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4845   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4846   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4847 
4848   void print() const {
4849     // rounding control
4850     const char* rc;
4851     switch (rounding_control()) {
4852       case 0: rc = "round near"; break;
4853       case 1: rc = "round down"; break;
4854       case 2: rc = "round up  "; break;
4855       case 3: rc = "chop      "; break;
4856     };
4857     // precision control
4858     const char* pc;
4859     switch (precision_control()) {
4860       case 0: pc = "24 bits "; break;
4861       case 1: pc = "reserved"; break;
4862       case 2: pc = "53 bits "; break;
4863       case 3: pc = "64 bits "; break;
4864     };
4865     // flags
4866     char f[9];
4867     f[0] = ' ';
4868     f[1] = ' ';
4869     f[2] = (precision   ()) ? 'P' : 'p';
4870     f[3] = (underflow   ()) ? 'U' : 'u';
4871     f[4] = (overflow    ()) ? 'O' : 'o';
4872     f[5] = (zero_divide ()) ? 'Z' : 'z';
4873     f[6] = (denormalized()) ? 'D' : 'd';
4874     f[7] = (invalid     ()) ? 'I' : 'i';
4875     f[8] = '\x0';
4876     // output
4877     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4878   }
4879 
4880 };
4881 
4882 class StatusWord {
4883  public:
4884   int32_t _value;
4885 
4886   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4887   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4888   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4889   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4890   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4891   int  top() const                     { return  (_value >> 11) & 7      ; }
4892   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4893   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4894   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4895   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4896   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4897   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4898   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4899   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4900 
4901   void print() const {
4902     // condition codes
4903     char c[5];
4904     c[0] = (C3()) ? '3' : '-';
4905     c[1] = (C2()) ? '2' : '-';
4906     c[2] = (C1()) ? '1' : '-';
4907     c[3] = (C0()) ? '0' : '-';
4908     c[4] = '\x0';
4909     // flags
4910     char f[9];
4911     f[0] = (error_status()) ? 'E' : '-';
4912     f[1] = (stack_fault ()) ? 'S' : '-';
4913     f[2] = (precision   ()) ? 'P' : '-';
4914     f[3] = (underflow   ()) ? 'U' : '-';
4915     f[4] = (overflow    ()) ? 'O' : '-';
4916     f[5] = (zero_divide ()) ? 'Z' : '-';
4917     f[6] = (denormalized()) ? 'D' : '-';
4918     f[7] = (invalid     ()) ? 'I' : '-';
4919     f[8] = '\x0';
4920     // output
4921     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4922   }
4923 
4924 };
4925 
4926 class TagWord {
4927  public:
4928   int32_t _value;
4929 
4930   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4931 
4932   void print() const {
4933     printf("%04x", _value & 0xFFFF);
4934   }
4935 
4936 };
4937 
4938 class FPU_Register {
4939  public:
4940   int32_t _m0;
4941   int32_t _m1;
4942   int16_t _ex;
4943 
4944   bool is_indefinite() const           {
4945     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4946   }
4947 
4948   void print() const {
4949     char  sign = (_ex < 0) ? '-' : '+';
4950     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4951     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4952   };
4953 
4954 };
4955 
4956 class FPU_State {
4957  public:
4958   enum {
4959     register_size       = 10,
4960     number_of_registers =  8,
4961     register_mask       =  7
4962   };
4963 
4964   ControlWord  _control_word;
4965   StatusWord   _status_word;
4966   TagWord      _tag_word;
4967   int32_t      _error_offset;
4968   int32_t      _error_selector;
4969   int32_t      _data_offset;
4970   int32_t      _data_selector;
4971   int8_t       _register[register_size * number_of_registers];
4972 
4973   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4974   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4975 
4976   const char* tag_as_string(int tag) const {
4977     switch (tag) {
4978       case 0: return "valid";
4979       case 1: return "zero";
4980       case 2: return "special";
4981       case 3: return "empty";
4982     }
4983     ShouldNotReachHere();
4984     return NULL;
4985   }
4986 
4987   void print() const {
4988     // print computation registers
4989     { int t = _status_word.top();
4990       for (int i = 0; i < number_of_registers; i++) {
4991         int j = (i - t) & register_mask;
4992         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4993         st(j)->print();
4994         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4995       }
4996     }
4997     printf("\n");
4998     // print control registers
4999     printf("ctrl = "); _control_word.print(); printf("\n");
5000     printf("stat = "); _status_word .print(); printf("\n");
5001     printf("tags = "); _tag_word    .print(); printf("\n");
5002   }
5003 
5004 };
5005 
5006 class Flag_Register {
5007  public:
5008   int32_t _value;
5009 
5010   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5011   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5012   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5013   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5014   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5015   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5016   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5017 
5018   void print() const {
5019     // flags
5020     char f[8];
5021     f[0] = (overflow       ()) ? 'O' : '-';
5022     f[1] = (direction      ()) ? 'D' : '-';
5023     f[2] = (sign           ()) ? 'S' : '-';
5024     f[3] = (zero           ()) ? 'Z' : '-';
5025     f[4] = (auxiliary_carry()) ? 'A' : '-';
5026     f[5] = (parity         ()) ? 'P' : '-';
5027     f[6] = (carry          ()) ? 'C' : '-';
5028     f[7] = '\x0';
5029     // output
5030     printf("%08x  flags = %s", _value, f);
5031   }
5032 
5033 };
5034 
5035 class IU_Register {
5036  public:
5037   int32_t _value;
5038 
5039   void print() const {
5040     printf("%08x  %11d", _value, _value);
5041   }
5042 
5043 };
5044 
5045 class IU_State {
5046  public:
5047   Flag_Register _eflags;
5048   IU_Register   _rdi;
5049   IU_Register   _rsi;
5050   IU_Register   _rbp;
5051   IU_Register   _rsp;
5052   IU_Register   _rbx;
5053   IU_Register   _rdx;
5054   IU_Register   _rcx;
5055   IU_Register   _rax;
5056 
5057   void print() const {
5058     // computation registers
5059     printf("rax  = "); _rax.print(); printf("\n");
5060     printf("rbx  = "); _rbx.print(); printf("\n");
5061     printf("rcx  = "); _rcx.print(); printf("\n");
5062     printf("rdx  = "); _rdx.print(); printf("\n");
5063     printf("rdi  = "); _rdi.print(); printf("\n");
5064     printf("rsi  = "); _rsi.print(); printf("\n");
5065     printf("rbp  = "); _rbp.print(); printf("\n");
5066     printf("rsp  = "); _rsp.print(); printf("\n");
5067     printf("\n");
5068     // control registers
5069     printf("flgs = "); _eflags.print(); printf("\n");
5070   }
5071 };
5072 
5073 
5074 class CPU_State {
5075  public:
5076   FPU_State _fpu_state;
5077   IU_State  _iu_state;
5078 
5079   void print() const {
5080     printf("--------------------------------------------------\n");
5081     _iu_state .print();
5082     printf("\n");
5083     _fpu_state.print();
5084     printf("--------------------------------------------------\n");
5085   }
5086 
5087 };
5088 
5089 
5090 static void _print_CPU_state(CPU_State* state) {
5091   state->print();
5092 };
5093 
5094 
5095 void MacroAssembler::print_CPU_state() {
5096   push_CPU_state();
5097   push(rsp);                // pass CPU state
5098   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5099   addptr(rsp, wordSize);       // discard argument
5100   pop_CPU_state();
5101 }
5102 
5103 
5104 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5105   static int counter = 0;
5106   FPU_State* fs = &state->_fpu_state;
5107   counter++;
5108   // For leaf calls, only verify that the top few elements remain empty.
5109   // We only need 1 empty at the top for C2 code.
5110   if( stack_depth < 0 ) {
5111     if( fs->tag_for_st(7) != 3 ) {
5112       printf("FPR7 not empty\n");
5113       state->print();
5114       assert(false, "error");
5115       return false;
5116     }
5117     return true;                // All other stack states do not matter
5118   }
5119 
5120   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5121          "bad FPU control word");
5122 
5123   // compute stack depth
5124   int i = 0;
5125   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5126   int d = i;
5127   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5128   // verify findings
5129   if (i != FPU_State::number_of_registers) {
5130     // stack not contiguous
5131     printf("%s: stack not contiguous at ST%d\n", s, i);
5132     state->print();
5133     assert(false, "error");
5134     return false;
5135   }
5136   // check if computed stack depth corresponds to expected stack depth
5137   if (stack_depth < 0) {
5138     // expected stack depth is -stack_depth or less
5139     if (d > -stack_depth) {
5140       // too many elements on the stack
5141       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5142       state->print();
5143       assert(false, "error");
5144       return false;
5145     }
5146   } else {
5147     // expected stack depth is stack_depth
5148     if (d != stack_depth) {
5149       // wrong stack depth
5150       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5151       state->print();
5152       assert(false, "error");
5153       return false;
5154     }
5155   }
5156   // everything is cool
5157   return true;
5158 }
5159 
5160 
5161 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5162   if (!VerifyFPU) return;
5163   push_CPU_state();
5164   push(rsp);                // pass CPU state
5165   ExternalAddress msg((address) s);
5166   // pass message string s
5167   pushptr(msg.addr());
5168   push(stack_depth);        // pass stack depth
5169   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5170   addptr(rsp, 3 * wordSize);   // discard arguments
5171   // check for error
5172   { Label L;
5173     testl(rax, rax);
5174     jcc(Assembler::notZero, L);
5175     int3();                  // break if error condition
5176     bind(L);
5177   }
5178   pop_CPU_state();
5179 }
5180 
5181 void MacroAssembler::restore_cpu_control_state_after_jni() {
5182   // Either restore the MXCSR register after returning from the JNI Call
5183   // or verify that it wasn't changed (with -Xcheck:jni flag).
5184   if (VM_Version::supports_sse()) {
5185     if (RestoreMXCSROnJNICalls) {
5186       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5187     } else if (CheckJNICalls) {
5188       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5189     }
5190   }
5191   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5192   vzeroupper();
5193   // Reset k1 to 0xffff.
5194 
5195 #ifdef COMPILER2
5196   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
5197     push(rcx);
5198     movl(rcx, 0xffff);
5199     kmovwl(k1, rcx);
5200     pop(rcx);
5201   }
5202 #endif // COMPILER2
5203 
5204 #ifndef _LP64
5205   // Either restore the x87 floating point control word after returning
5206   // from the JNI call or verify that it wasn't changed.
5207   if (CheckJNICalls) {
5208     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5209   }
5210 #endif // _LP64
5211 }
5212 
5213 // ((OopHandle)result).resolve();
5214 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5215   assert_different_registers(result, tmp);
5216 
5217   // Only 64 bit platforms support GCs that require a tmp register
5218   // Only IN_HEAP loads require a thread_tmp register
5219   // OopHandle::resolve is an indirection like jobject.
5220   access_load_at(T_OBJECT, IN_NATIVE,
5221                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5222 }
5223 
5224 // ((WeakHandle)result).resolve();
5225 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5226   assert_different_registers(rresult, rtmp);
5227   Label resolved;
5228 
5229   // A null weak handle resolves to null.
5230   cmpptr(rresult, 0);
5231   jcc(Assembler::equal, resolved);
5232 
5233   // Only 64 bit platforms support GCs that require a tmp register
5234   // Only IN_HEAP loads require a thread_tmp register
5235   // WeakHandle::resolve is an indirection like jweak.
5236   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5237                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5238   bind(resolved);
5239 }
5240 
5241 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5242   // get mirror
5243   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5244   load_method_holder(mirror, method);
5245   movptr(mirror, Address(mirror, mirror_offset));
5246   resolve_oop_handle(mirror, tmp);
5247 }
5248 
5249 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5250   load_method_holder(rresult, rmethod);
5251   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5252 }
5253 
5254 void MacroAssembler::load_method_holder(Register holder, Register method) {
5255   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5256   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5257   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
5258 }
5259 
5260 void MacroAssembler::load_klass(Register dst, Register src) {
5261 #ifdef _LP64
5262   if (UseCompressedClassPointers) {
5263     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5264     decode_klass_not_null(dst);
5265   } else
5266 #endif
5267     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5268 }
5269 
5270 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5271   load_klass(dst, src);
5272   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5273 }
5274 
5275 void MacroAssembler::store_klass(Register dst, Register src) {
5276 #ifdef _LP64
5277   if (UseCompressedClassPointers) {
5278     encode_klass_not_null(src);
5279     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5280   } else
5281 #endif
5282     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5283 }
5284 
5285 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5286                                     Register tmp1, Register thread_tmp) {
5287   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5288   decorators = AccessInternal::decorator_fixup(decorators);
5289   bool as_raw = (decorators & AS_RAW) != 0;
5290   if (as_raw) {
5291     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5292   } else {
5293     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5294   }
5295 }
5296 
5297 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5298                                      Register tmp1, Register tmp2) {
5299   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5300   decorators = AccessInternal::decorator_fixup(decorators);
5301   bool as_raw = (decorators & AS_RAW) != 0;
5302   if (as_raw) {
5303     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
5304   } else {
5305     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
5306   }
5307 }
5308 
5309 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5310   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5311   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5312     decorators |= ACCESS_READ | ACCESS_WRITE;
5313   }
5314   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5315   return bs->resolve(this, decorators, obj);
5316 }
5317 
5318 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5319                                    Register thread_tmp, DecoratorSet decorators) {
5320   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5321 }
5322 
5323 // Doesn't do verification, generates fixed size code
5324 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5325                                             Register thread_tmp, DecoratorSet decorators) {
5326   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5327 }
5328 
5329 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
5330                                     Register tmp2, DecoratorSet decorators) {
5331   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
5332 }
5333 
5334 // Used for storing NULLs.
5335 void MacroAssembler::store_heap_oop_null(Address dst) {
5336   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
5337 }
5338 
5339 #ifdef _LP64
5340 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5341   if (UseCompressedClassPointers) {
5342     // Store to klass gap in destination
5343     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5344   }
5345 }
5346 
5347 #ifdef ASSERT
5348 void MacroAssembler::verify_heapbase(const char* msg) {
5349   assert (UseCompressedOops, "should be compressed");
5350   assert (Universe::heap() != NULL, "java heap should be initialized");
5351   if (CheckCompressedOops) {
5352     Label ok;
5353     push(rscratch1); // cmpptr trashes rscratch1
5354     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5355     jcc(Assembler::equal, ok);
5356     STOP(msg);
5357     bind(ok);
5358     pop(rscratch1);
5359   }
5360 }
5361 #endif
5362 
5363 // Algorithm must match oop.inline.hpp encode_heap_oop.
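// In effect: narrow_oop = (oop == NULL) ? 0 : (oop - CompressedOops::base()) >> CompressedOops::shift()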
5364 void MacroAssembler::encode_heap_oop(Register r) {
5365 #ifdef ASSERT
5366   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5367 #endif
5368   verify_oop(r, "broken oop in encode_heap_oop");
5369   if (CompressedOops::base() == NULL) {
5370     if (CompressedOops::shift() != 0) {
5371       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5372       shrq(r, LogMinObjAlignmentInBytes);
5373     }
5374     return;
5375   }
5376   testq(r, r);
5377   cmovq(Assembler::equal, r, r12_heapbase);
5378   subq(r, r12_heapbase);
5379   shrq(r, LogMinObjAlignmentInBytes);
5380 }
5381 
5382 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5383 #ifdef ASSERT
5384   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5385   if (CheckCompressedOops) {
5386     Label ok;
5387     testq(r, r);
5388     jcc(Assembler::notEqual, ok);
5389     STOP("null oop passed to encode_heap_oop_not_null");
5390     bind(ok);
5391   }
5392 #endif
5393   verify_oop(r, "broken oop in encode_heap_oop_not_null");
5394   if (CompressedOops::base() != NULL) {
5395     subq(r, r12_heapbase);
5396   }
5397   if (CompressedOops::shift() != 0) {
5398     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5399     shrq(r, LogMinObjAlignmentInBytes);
5400   }
5401 }
5402 
5403 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5404 #ifdef ASSERT
5405   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5406   if (CheckCompressedOops) {
5407     Label ok;
5408     testq(src, src);
5409     jcc(Assembler::notEqual, ok);
5410     STOP("null oop passed to encode_heap_oop_not_null2");
5411     bind(ok);
5412   }
5413 #endif
5414   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5415   if (dst != src) {
5416     movq(dst, src);
5417   }
5418   if (CompressedOops::base() != NULL) {
5419     subq(dst, r12_heapbase);
5420   }
5421   if (CompressedOops::shift() != 0) {
5422     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5423     shrq(dst, LogMinObjAlignmentInBytes);
5424   }
5425 }
5426 
5427 void  MacroAssembler::decode_heap_oop(Register r) {
5428 #ifdef ASSERT
5429   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5430 #endif
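  // In effect: oop = (narrow_oop == 0) ? NULL : CompressedOops::base() + ((uintptr_t)narrow_oop << CompressedOops::shift())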
5431   if (CompressedOops::base() == NULL) {
5432     if (CompressedOops::shift() != 0) {
5433       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5434       shlq(r, LogMinObjAlignmentInBytes);
5435     }
5436   } else {
5437     Label done;
5438     shlq(r, LogMinObjAlignmentInBytes);
5439     jccb(Assembler::equal, done);
5440     addq(r, r12_heapbase);
5441     bind(done);
5442   }
5443   verify_oop(r, "broken oop in decode_heap_oop");
5444 }
5445 
5446 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5447   // Note: it will change flags
5448   assert (UseCompressedOops, "should only be used for compressed headers");
5449   assert (Universe::heap() != NULL, "java heap should be initialized");
5450   // Cannot assert, unverified entry point counts instructions (see .ad file)
5451   // vtableStubs also counts instructions in pd_code_size_limit.
5452   // Also do not verify_oop as this is called by verify_oop.
5453   if (CompressedOops::shift() != 0) {
5454     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5455     shlq(r, LogMinObjAlignmentInBytes);
5456     if (CompressedOops::base() != NULL) {
5457       addq(r, r12_heapbase);
5458     }
5459   } else {
5460     assert (CompressedOops::base() == NULL, "sanity");
5461   }
5462 }
5463 
5464 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5465   // Note: it will change flags
5466   assert (UseCompressedOops, "should only be used for compressed headers");
5467   assert (Universe::heap() != NULL, "java heap should be initialized");
5468   // Cannot assert, unverified entry point counts instructions (see .ad file)
5469   // vtableStubs also counts instructions in pd_code_size_limit.
5470   // Also do not verify_oop as this is called by verify_oop.
5471   if (CompressedOops::shift() != 0) {
5472     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5473     if (LogMinObjAlignmentInBytes == Address::times_8) {
5474       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5475     } else {
5476       if (dst != src) {
5477         movq(dst, src);
5478       }
5479       shlq(dst, LogMinObjAlignmentInBytes);
5480       if (CompressedOops::base() != NULL) {
5481         addq(dst, r12_heapbase);
5482       }
5483     }
5484   } else {
5485     assert (CompressedOops::base() == NULL, "sanity");
5486     if (dst != src) {
5487       movq(dst, src);
5488     }
5489   }
5490 }
5491 
5492 void MacroAssembler::encode_klass_not_null(Register r) {
5493   if (CompressedKlassPointers::base() != NULL) {
5494     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5495     assert(r != r12_heapbase, "Encoding a klass in r12");
5496     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5497     subq(r, r12_heapbase);
5498   }
5499   if (CompressedKlassPointers::shift() != 0) {
5500     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5501     shrq(r, LogKlassAlignmentInBytes);
5502   }
5503   if (CompressedKlassPointers::base() != NULL) {
5504     reinit_heapbase();
5505   }
5506 }
5507 
5508 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5509   if (dst == src) {
5510     encode_klass_not_null(src);
5511   } else {
5512     if (CompressedKlassPointers::base() != NULL) {
5513       mov64(dst, (int64_t)CompressedKlassPointers::base());
5514       negq(dst);
5515       addq(dst, src);
5516     } else {
5517       movptr(dst, src);
5518     }
5519     if (CompressedKlassPointers::shift() != 0) {
5520       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5521       shrq(dst, LogKlassAlignmentInBytes);
5522     }
5523   }
5524 }
5525 
5526 // Function instr_size_for_decode_klass_not_null() counts the instructions
5527 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
5528 // when (Universe::heap() != NULL).  Hence, if the instructions they
5529 // generate change, then this method needs to be updated.
5530 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5531   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5532   if (CompressedKlassPointers::base() != NULL) {
5533     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5534     return (CompressedKlassPointers::shift() == 0 ? 20 : 24);
5535   } else {
5536     // longest load decode klass function, mov64, leaq
5537     return 16;
5538   }
5539 }
5540 
5541 // !!! If the instructions that get generated here change then function
5542 // instr_size_for_decode_klass_not_null() needs to get updated.
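// In effect: klass = CompressedKlassPointers::base() + ((uintptr_t)narrow_klass << CompressedKlassPointers::shift())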
5543 void  MacroAssembler::decode_klass_not_null(Register r) {
5544   // Note: it will change flags
5545   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5546   assert(r != r12_heapbase, "Decoding a klass in r12");
5547   // Cannot assert, unverified entry point counts instructions (see .ad file)
5548   // vtableStubs also counts instructions in pd_code_size_limit.
5549   // Also do not verify_oop as this is called by verify_oop.
5550   if (CompressedKlassPointers::shift() != 0) {
5551     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5552     shlq(r, LogKlassAlignmentInBytes);
5553   }
5554   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5555   if (CompressedKlassPointers::base() != NULL) {
5556     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5557     addq(r, r12_heapbase);
5558     reinit_heapbase();
5559   }
5560 }
5561 
5562 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5563   // Note: it will change flags
5564   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5565   if (dst == src) {
5566     decode_klass_not_null(dst);
5567   } else {
5568     // Cannot assert, unverified entry point counts instructions (see .ad file)
5569     // vtableStubs also counts instructions in pd_code_size_limit.
5570     // Also do not verify_oop as this is called by verify_oop.
5571     mov64(dst, (int64_t)CompressedKlassPointers::base());
5572     if (CompressedKlassPointers::shift() != 0) {
5573       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5574       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5575       leaq(dst, Address(dst, src, Address::times_8, 0));
5576     } else {
5577       addq(dst, src);
5578     }
5579   }
5580 }
5581 
5582 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5583   assert (UseCompressedOops, "should only be used for compressed headers");
5584   assert (Universe::heap() != NULL, "java heap should be initialized");
5585   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5586   int oop_index = oop_recorder()->find_index(obj);
5587   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5588   mov_narrow_oop(dst, oop_index, rspec);
5589 }
5590 
5591 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5592   assert (UseCompressedOops, "should only be used for compressed headers");
5593   assert (Universe::heap() != NULL, "java heap should be initialized");
5594   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5595   int oop_index = oop_recorder()->find_index(obj);
5596   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5597   mov_narrow_oop(dst, oop_index, rspec);
5598 }
5599 
5600 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5601   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5602   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5603   int klass_index = oop_recorder()->find_index(k);
5604   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5605   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5606 }
5607 
5608 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5609   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5610   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5611   int klass_index = oop_recorder()->find_index(k);
5612   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5613   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5614 }
5615 
5616 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5617   assert (UseCompressedOops, "should only be used for compressed headers");
5618   assert (Universe::heap() != NULL, "java heap should be initialized");
5619   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5620   int oop_index = oop_recorder()->find_index(obj);
5621   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5622   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5623 }
5624 
5625 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5626   assert (UseCompressedOops, "should only be used for compressed headers");
5627   assert (Universe::heap() != NULL, "java heap should be initialized");
5628   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5629   int oop_index = oop_recorder()->find_index(obj);
5630   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5631   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5632 }
5633 
5634 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5635   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5636   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5637   int klass_index = oop_recorder()->find_index(k);
5638   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5639   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5640 }
5641 
5642 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5643   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5644   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5645   int klass_index = oop_recorder()->find_index(k);
5646   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5647   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5648 }
5649 
5650 void MacroAssembler::reinit_heapbase() {
5651   if (UseCompressedOops || UseCompressedClassPointers) {
5652     if (Universe::heap() != NULL) {
5653       if (CompressedOops::base() == NULL) {
5654         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5655       } else {
5656         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5657       }
5658     } else {
5659       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5660     }
5661   }
5662 }
5663 
5664 #endif // _LP64
5665 
5666 // C2 compiled method's prolog code.
5667 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5668 
5669   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5670   // NativeJump::patch_verified_entry will be able to patch out the entry
5671   // code safely. The push to verify stack depth is ok at 5 bytes,
5672   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5673   // stack bang then we must use the 6 byte frame allocation even if
5674   // we have no frame. :-(
5675   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5676 
5677   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5678   // Remove word for return addr
5679   framesize -= wordSize;
5680   stack_bang_size -= wordSize;
5681 
5682   // Calls to C2R adapters often do not accept exceptional returns.
5683   // We require that their callers must bang for them.  But be careful, because
5684   // some VM calls (such as call site linkage) can use several kilobytes of
5685   // stack.  But the stack safety zone should account for that.
5686   // See bugs 4446381, 4468289, 4497237.
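  // Both branches below produce the same frame layout, from higher to lower addresses:
  //   [return address] [saved rbp] [remaining frame data]   <- rsp after the prolog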
5687   if (stack_bang_size > 0) {
5688     generate_stack_overflow_check(stack_bang_size);
5689 
5690     // We always push rbp, so that on return to interpreter rbp, will be
5691     // restored correctly and we can correct the stack.
5692     push(rbp);
5693     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5694     if (PreserveFramePointer) {
5695       mov(rbp, rsp);
5696     }
5697     // Remove word for ebp
5698     framesize -= wordSize;
5699 
5700     // Create frame
5701     if (framesize) {
5702       subptr(rsp, framesize);
5703     }
5704   } else {
5705     // Create frame (force generation of a 4 byte immediate value)
5706     subptr_imm32(rsp, framesize);
5707 
5708     // Save RBP register now.
5709     framesize -= wordSize;
5710     movptr(Address(rsp, framesize), rbp);
5711     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5712     if (PreserveFramePointer) {
5713       movptr(rbp, rsp);
5714       if (framesize > 0) {
5715         addptr(rbp, framesize);
5716       }
5717     }
5718   }
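  // Rough picture of the frame built above (a sketch, assuming the common
  // 64-bit layout; the stack grows downward):
  //   [ return address ]                    <- pushed by the caller
  //   [ saved rbp      ]
  //   [ framesize bytes of spills/locals ]  <- rsp after the adjustment above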
5719 
5720   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5721     framesize -= wordSize;
5722     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5723   }
5724 
5725 #ifndef _LP64
5726   // If method sets FPU control word do it now
5727   if (fp_mode_24b) {
5728     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
5729   }
5730   if (UseSSE >= 2 && VerifyFPU) {
5731     verify_FPU(0, "FPU stack must be clean on entry");
5732   }
5733 #endif
5734 
5735 #ifdef ASSERT
5736   if (VerifyStackAtCalls) {
5737     Label L;
5738     push(rax);
5739     mov(rax, rsp);
5740     andptr(rax, StackAlignmentInBytes-1);
5741     cmpptr(rax, StackAlignmentInBytes-wordSize);
5742     pop(rax);
5743     jcc(Assembler::equal, L);
5744     STOP("Stack is not properly aligned!");
5745     bind(L);
5746   }
5747 #endif
5748 
5749   if (!is_stub) {
5750     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5751     bs->nmethod_entry_barrier(this);
5752   }
5753 }
5754 
5755 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
5756 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
5757   // cnt - number of qwords (8-byte words).
5758   // base - start address, qword aligned.
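  // Shape of the clearing loop below (a pseudocode sketch, not literal code):
  //   zero 64 bytes per iteration while at least 8 qwords remain,
  //   then one 32-byte store if at least 4 qwords remain,
  //   then single 8-byte stores for any remaining tail qwords.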
5759   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5760   if (UseAVX >= 2) {
5761     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5762   } else {
5763     pxor(xtmp, xtmp);
5764   }
5765   jmp(L_zero_64_bytes);
5766 
5767   BIND(L_loop);
5768   if (UseAVX >= 2) {
5769     vmovdqu(Address(base,  0), xtmp);
5770     vmovdqu(Address(base, 32), xtmp);
5771   } else {
5772     movdqu(Address(base,  0), xtmp);
5773     movdqu(Address(base, 16), xtmp);
5774     movdqu(Address(base, 32), xtmp);
5775     movdqu(Address(base, 48), xtmp);
5776   }
5777   addptr(base, 64);
5778 
5779   BIND(L_zero_64_bytes);
5780   subptr(cnt, 8);
5781   jccb(Assembler::greaterEqual, L_loop);
5782   addptr(cnt, 4);
5783   jccb(Assembler::less, L_tail);
5784   // Copy trailing 32 bytes
5785   if (UseAVX >= 2) {
5786     vmovdqu(Address(base, 0), xtmp);
5787   } else {
5788     movdqu(Address(base,  0), xtmp);
5789     movdqu(Address(base, 16), xtmp);
5790   }
5791   addptr(base, 32);
5792   subptr(cnt, 4);
5793 
5794   BIND(L_tail);
5795   addptr(cnt, 4);
5796   jccb(Assembler::lessEqual, L_end);
5797   decrement(cnt);
5798 
5799   BIND(L_sloop);
5800   movq(Address(base, 0), xtmp);
5801   addptr(base, 8);
5802   decrement(cnt);
5803   jccb(Assembler::greaterEqual, L_sloop);
5804   BIND(L_end);
5805 }
5806 
5807 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
5808   // cnt - number of qwords (8-byte words).
5809   // base - start address, qword aligned.
5810   // is_large - if optimizers know cnt is larger than InitArrayShortSize
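  // Strategy sketch (pseudocode, assuming the 64-bit VM):
  //   if (!is_large && cnt <= InitArrayShortSize / BytesPerLong)
  //     store zero qword by qword;                    // short counts
  //   else if (UseFastStosb)      rep stosb over 8 * cnt bytes;
  //   else if (UseXMMForObjInit)  xmm_clear_mem(base, cnt, xtmp);
  //   else                        rep stos over cnt pointer-sized words;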
5811   assert(base==rdi, "base register must be edi for rep stos");
5812   assert(tmp==rax,   "tmp register must be eax for rep stos");
5813   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5814   assert(InitArrayShortSize % BytesPerLong == 0,
5815     "InitArrayShortSize should be the multiple of BytesPerLong");
5816 
5817   Label DONE;
5818 
5819   if (!is_large || !UseXMMForObjInit) {
5820     xorptr(tmp, tmp);
5821   }
5822 
5823   if (!is_large) {
5824     Label LOOP, LONG;
5825     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5826     jccb(Assembler::greater, LONG);
5827 
5828     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5829 
5830     decrement(cnt);
5831     jccb(Assembler::negative, DONE); // Zero length
5832 
5833     // Use individual pointer-sized stores for small counts:
5834     BIND(LOOP);
5835     movptr(Address(base, cnt, Address::times_ptr), tmp);
5836     decrement(cnt);
5837     jccb(Assembler::greaterEqual, LOOP);
5838     jmpb(DONE);
5839 
5840     BIND(LONG);
5841   }
5842 
5843   // Use longer rep-prefixed ops for non-small counts:
5844   if (UseFastStosb) {
5845     shlptr(cnt, 3); // convert to number of bytes
5846     rep_stosb();
5847   } else if (UseXMMForObjInit) {
5848     movptr(tmp, base);
5849     xmm_clear_mem(tmp, cnt, xtmp);
5850   } else {
5851     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5852     rep_stos();
5853   }
5854 
5855   BIND(DONE);
5856 }
5857 
5858 #ifdef COMPILER2
5859 
5860 // IndexOf for constant substrings with size >= 8 chars
5861 // which don't need to be loaded through the stack.
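// A hedged sketch of the equivalent (naive) Java-level logic, where cnt1 and
// cnt2 are the element counts of the string and the substring:
//   outer:
//   for (int i = 0; i + cnt2 <= cnt1; i++) {
//     for (int j = 0; j < cnt2; j++) {
//       if (str1[i + j] != str2[j]) continue outer;
//     }
//     return i;
//   }
//   return -1;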
5862 void MacroAssembler::string_indexofC8(Register str1, Register str2,
5863                                       Register cnt1, Register cnt2,
5864                                       int int_cnt2,  Register result,
5865                                       XMMRegister vec, Register tmp,
5866                                       int ae) {
5867   ShortBranchVerifier sbv(this);
5868   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
5869   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
5870 
5871   // This method uses the pcmpestri instruction with bound registers
5872   //   inputs:
5873   //     xmm - substring
5874   //     rax - substring length (elements count)
5875   //     mem - scanned string
5876   //     rdx - string length (elements count)
5877   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
5878   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
5879   //   outputs:
5880   //     rcx - matched index in string
5881   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
5882   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
5883   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
5884   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
5885   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
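  // For reference, the pcmpestri imm8 decodes as: bits[1:0] = element type
  // (00 unsigned bytes, 01 unsigned words), bits[3:2] = 11 (equal ordered,
  // i.e. substring search), bits[5:4] = 00 (positive polarity), bit 6 = 0
  // (report the least significant matching index in rcx). Hence 0x0c scans
  // bytes and 0x0d scans words.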
5886 
5887   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
5888         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
5889         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
5890 
5891   // Note, inline_string_indexOf() generates checks:
5892   // if (substr.count > string.count) return -1;
5893   // if (substr.count == 0) return 0;
5894   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
5895 
5896   // Load substring.
5897   if (ae == StrIntrinsicNode::UL) {
5898     pmovzxbw(vec, Address(str2, 0));
5899   } else {
5900     movdqu(vec, Address(str2, 0));
5901   }
5902   movl(cnt2, int_cnt2);
5903   movptr(result, str1); // string addr
5904 
5905   if (int_cnt2 > stride) {
5906     jmpb(SCAN_TO_SUBSTR);
5907 
5908     // Reload substr for rescan, this code
5909     // is executed only for large substrings (> 8 chars)
5910     bind(RELOAD_SUBSTR);
5911     if (ae == StrIntrinsicNode::UL) {
5912       pmovzxbw(vec, Address(str2, 0));
5913     } else {
5914       movdqu(vec, Address(str2, 0));
5915     }
5916     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
5917 
5918     bind(RELOAD_STR);
5919     // We came here after the beginning of the substring was
5920     // matched but the rest of it was not so we need to search
5921     // again. Start from the next element after the previous match.
5922 
5923     // cnt2 is the number of remaining substring elements and
5924     // cnt1 is the number of remaining string elements when the cmp failed.
5925     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
5926     subl(cnt1, cnt2);
5927     addl(cnt1, int_cnt2);
5928     movl(cnt2, int_cnt2); // Now restore cnt2
5929 
5930     decrementl(cnt1);     // Shift to next element
5931     cmpl(cnt1, cnt2);
5932     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
5933 
5934     addptr(result, (1<<scale1));
5935 
5936   } // (int_cnt2 > 8)
5937 
5938   // Scan string for start of substr in 16-byte vectors
5939   bind(SCAN_TO_SUBSTR);
5940   pcmpestri(vec, Address(result, 0), mode);
5941   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
5942   subl(cnt1, stride);
5943   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
5944   cmpl(cnt1, cnt2);
5945   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
5946   addptr(result, 16);
5947   jmpb(SCAN_TO_SUBSTR);
5948 
5949   // Found a potential substr
5950   bind(FOUND_CANDIDATE);
5951   // Matched whole vector if first element matched (tmp(rcx) == 0).
5952   if (int_cnt2 == stride) {
5953     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
5954   } else { // int_cnt2 > 8
5955     jccb(Assembler::overflow, FOUND_SUBSTR);
5956   }
5957   // After pcmpestri tmp(rcx) contains matched element index
5958   // Compute start addr of substr
5959   lea(result, Address(result, tmp, scale1));
5960 
5961   // Make sure string is still long enough
5962   subl(cnt1, tmp);
5963   cmpl(cnt1, cnt2);
5964   if (int_cnt2 == stride) {
5965     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
5966   } else { // int_cnt2 > 8
5967     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
5968   }
5969   // Left less than substring.
5970 
5971   bind(RET_NOT_FOUND);
5972   movl(result, -1);
5973   jmp(EXIT);
5974 
5975   if (int_cnt2 > stride) {
5976     // This code is optimized for the case when whole substring
5977     // is matched if its head is matched.
5978     bind(MATCH_SUBSTR_HEAD);
5979     pcmpestri(vec, Address(result, 0), mode);
5980     // Reload only string if does not match
5981     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
5982 
5983     Label CONT_SCAN_SUBSTR;
5984     // Compare the rest of substring (> 8 chars).
5985     bind(FOUND_SUBSTR);
5986     // First 8 chars are already matched.
5987     negptr(cnt2);
5988     addptr(cnt2, stride);
5989 
5990     bind(SCAN_SUBSTR);
5991     subl(cnt1, stride);
5992     cmpl(cnt2, -stride); // Do not read beyond substring
5993     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
5994     // Back-up strings to avoid reading beyond substring:
5995     // cnt1 = cnt1 - cnt2 + 8
5996     addl(cnt1, cnt2); // cnt2 is negative
5997     addl(cnt1, stride);
5998     movl(cnt2, stride); negptr(cnt2);
5999     bind(CONT_SCAN_SUBSTR);
6000     if (int_cnt2 < (int)G) {
6001       int tail_off1 = int_cnt2<<scale1;
6002       int tail_off2 = int_cnt2<<scale2;
6003       if (ae == StrIntrinsicNode::UL) {
6004         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
6005       } else {
6006         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
6007       }
6008       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
6009     } else {
6010       // calculate index in register to avoid integer overflow (int_cnt2*2)
6011       movl(tmp, int_cnt2);
6012       addptr(tmp, cnt2);
6013       if (ae == StrIntrinsicNode::UL) {
6014         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
6015       } else {
6016         movdqu(vec, Address(str2, tmp, scale2, 0));
6017       }
6018       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
6019     }
6020     // Need to reload the string pointers if the whole vector did not match
6021     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6022     addptr(cnt2, stride);
6023     jcc(Assembler::negative, SCAN_SUBSTR);
6024     // Fall through if found full substring
6025 
6026   } // (int_cnt2 > 8)
6027 
6028   bind(RET_FOUND);
6029   // Found result if we matched full small substring.
6030   // Compute substr offset
6031   subptr(result, str1);
6032   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6033     shrl(result, 1); // index
6034   }
6035   bind(EXIT);
6036 
6037 } // string_indexofC8
6038 
6039 // Small strings are loaded through the stack if they cross a page boundary.
6040 void MacroAssembler::string_indexof(Register str1, Register str2,
6041                                     Register cnt1, Register cnt2,
6042                                     int int_cnt2,  Register result,
6043                                     XMMRegister vec, Register tmp,
6044                                     int ae) {
6045   ShortBranchVerifier sbv(this);
6046   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6047   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6048 
6049   //
6050   // int_cnt2 is length of small (< 8 chars) constant substring
6051   // or (-1) for non constant substring in which case its length
6052   // is in cnt2 register.
6053   //
6054   // Note, inline_string_indexOf() generates checks:
6055   // if (substr.count > string.count) return -1;
6056   // if (substr.count == 0) return 0;
6057   //
6058   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6059   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
6060   // This method uses the pcmpestri instruction with bound registers
6061   //   inputs:
6062   //     xmm - substring
6063   //     rax - substring length (elements count)
6064   //     mem - scanned string
6065   //     rdx - string length (elements count)
6066   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6067   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6068   //   outputs:
6069   //     rcx - matched index in string
6070   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6071   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6072   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6073   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6074 
6075   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6076         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6077         FOUND_CANDIDATE;
6078 
6079   { //========================================================
6080     // We don't know where these strings are located
6081     // and we can't read beyond them. Load them through the stack.
6082     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6083 
6084     movptr(tmp, rsp); // save old SP
6085 
6086     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6087       if (int_cnt2 == (1>>scale2)) { // One byte
6088         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
6089         load_unsigned_byte(result, Address(str2, 0));
6090         movdl(vec, result); // move 32 bits
6091       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
6092         // Not enough header space in 32-bit VM: 12+3 = 15.
6093         movl(result, Address(str2, -1));
6094         shrl(result, 8);
6095         movdl(vec, result); // move 32 bits
6096       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
6097         load_unsigned_short(result, Address(str2, 0));
6098         movdl(vec, result); // move 32 bits
6099       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
6100         movdl(vec, Address(str2, 0)); // move 32 bits
6101       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
6102         movq(vec, Address(str2, 0));  // move 64 bits
6103       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
6104         // Array header size is 12 bytes in 32-bit VM
6105         // + 6 bytes for 3 chars == 18 bytes,
6106         // enough space to load vec and shift.
6107         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6108         if (ae == StrIntrinsicNode::UL) {
6109           int tail_off = int_cnt2-8;
6110           pmovzxbw(vec, Address(str2, tail_off));
6111           psrldq(vec, -2*tail_off);
6112         }
6113         else {
6114           int tail_off = int_cnt2*(1<<scale2);
6115           movdqu(vec, Address(str2, tail_off-16));
6116           psrldq(vec, 16-tail_off);
6117         }
6118       }
6119     } else { // not constant substring
6120       cmpl(cnt2, stride);
6121       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6122 
6123       // We can read beyond the string if str+16 does not cross a page boundary
6124       // since heaps are aligned and mapped by pages.
6125       assert(os::vm_page_size() < (int)G, "default page should be small");
6126       movl(result, str2); // We need only low 32 bits
6127       andl(result, (os::vm_page_size()-1));
6128       cmpl(result, (os::vm_page_size()-16));
6129       jccb(Assembler::belowEqual, CHECK_STR);
6130 
6131       // Move small strings to the stack to allow loading 16 bytes into vec.
6132       subptr(rsp, 16);
6133       int stk_offset = wordSize-(1<<scale2);
6134       push(cnt2);
6135 
6136       bind(COPY_SUBSTR);
6137       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
6138         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
6139         movb(Address(rsp, cnt2, scale2, stk_offset), result);
6140       } else if (ae == StrIntrinsicNode::UU) {
6141         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
6142         movw(Address(rsp, cnt2, scale2, stk_offset), result);
6143       }
6144       decrement(cnt2);
6145       jccb(Assembler::notZero, COPY_SUBSTR);
6146 
6147       pop(cnt2);
6148       movptr(str2, rsp);  // New substring address
6149     } // non constant
6150 
6151     bind(CHECK_STR);
6152     cmpl(cnt1, stride);
6153     jccb(Assembler::aboveEqual, BIG_STRINGS);
6154 
6155     // Check cross page boundary.
6156     movl(result, str1); // We need only low 32 bits
6157     andl(result, (os::vm_page_size()-1));
6158     cmpl(result, (os::vm_page_size()-16));
6159     jccb(Assembler::belowEqual, BIG_STRINGS);
6160 
6161     subptr(rsp, 16);
6162     int stk_offset = -(1<<scale1);
6163     if (int_cnt2 < 0) { // not constant
6164       push(cnt2);
6165       stk_offset += wordSize;
6166     }
6167     movl(cnt2, cnt1);
6168 
6169     bind(COPY_STR);
6170     if (ae == StrIntrinsicNode::LL) {
6171       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
6172       movb(Address(rsp, cnt2, scale1, stk_offset), result);
6173     } else {
6174       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
6175       movw(Address(rsp, cnt2, scale1, stk_offset), result);
6176     }
6177     decrement(cnt2);
6178     jccb(Assembler::notZero, COPY_STR);
6179 
6180     if (int_cnt2 < 0) { // not constant
6181       pop(cnt2);
6182     }
6183     movptr(str1, rsp);  // New string address
6184 
6185     bind(BIG_STRINGS);
6186     // Load substring.
6187     if (int_cnt2 < 0) { // -1
6188       if (ae == StrIntrinsicNode::UL) {
6189         pmovzxbw(vec, Address(str2, 0));
6190       } else {
6191         movdqu(vec, Address(str2, 0));
6192       }
6193       push(cnt2);       // substr count
6194       push(str2);       // substr addr
6195       push(str1);       // string addr
6196     } else {
6197       // Small (< 8 chars) constant substrings are loaded already.
6198       movl(cnt2, int_cnt2);
6199     }
6200     push(tmp);  // original SP
6201 
6202   } // Finished loading
6203 
6204   //========================================================
6205   // Start search
6206   //
6207 
6208   movptr(result, str1); // string addr
6209 
6210   if (int_cnt2  < 0) {  // Only for non constant substring
6211     jmpb(SCAN_TO_SUBSTR);
6212 
6213     // SP saved at sp+0
6214     // String saved at sp+1*wordSize
6215     // Substr saved at sp+2*wordSize
6216     // Substr count saved at sp+3*wordSize
6217 
6218     // Reload substr for rescan, this code
6219     // is executed only for large substrings (> 8 chars)
6220     bind(RELOAD_SUBSTR);
6221     movptr(str2, Address(rsp, 2*wordSize));
6222     movl(cnt2, Address(rsp, 3*wordSize));
6223     if (ae == StrIntrinsicNode::UL) {
6224       pmovzxbw(vec, Address(str2, 0));
6225     } else {
6226       movdqu(vec, Address(str2, 0));
6227     }
6228     // We came here after the beginning of the substring was
6229     // matched but the rest of it was not so we need to search
6230     // again. Start from the next element after the previous match.
6231     subptr(str1, result); // Restore counter
6232     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6233       shrl(str1, 1);
6234     }
6235     addl(cnt1, str1);
6236     decrementl(cnt1);   // Shift to next element
6237     cmpl(cnt1, cnt2);
6238     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6239 
6240     addptr(result, (1<<scale1));
6241   } // non constant
6242 
6243   // Scan string for start of substr in 16-byte vectors
6244   bind(SCAN_TO_SUBSTR);
6245   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6246   pcmpestri(vec, Address(result, 0), mode);
6247   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6248   subl(cnt1, stride);
6249   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6250   cmpl(cnt1, cnt2);
6251   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6252   addptr(result, 16);
6253 
6254   bind(ADJUST_STR);
6255   cmpl(cnt1, stride); // Do not read beyond string
6256   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6257   // Back-up string to avoid reading beyond string.
6258   lea(result, Address(result, cnt1, scale1, -16));
6259   movl(cnt1, stride);
6260   jmpb(SCAN_TO_SUBSTR);
6261 
6262   // Found a potential substr
6263   bind(FOUND_CANDIDATE);
6264   // After pcmpestri tmp(rcx) contains matched element index
6265 
6266   // Make sure string is still long enough
6267   subl(cnt1, tmp);
6268   cmpl(cnt1, cnt2);
6269   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6270   // Left less than substring.
6271 
6272   bind(RET_NOT_FOUND);
6273   movl(result, -1);
6274   jmp(CLEANUP);
6275 
6276   bind(FOUND_SUBSTR);
6277   // Compute start addr of substr
6278   lea(result, Address(result, tmp, scale1));
6279   if (int_cnt2 > 0) { // Constant substring
6280     // Repeat search for small substring (< 8 chars)
6281     // from new point without reloading substring.
6282     // Have to check that we don't read beyond string.
6283     cmpl(tmp, stride-int_cnt2);
6284     jccb(Assembler::greater, ADJUST_STR);
6285     // Fall through if matched whole substring.
6286   } else { // non constant
6287     assert(int_cnt2 == -1, "should be != 0");
6288 
6289     addl(tmp, cnt2);
6290     // Found result if we matched whole substring.
6291     cmpl(tmp, stride);
6292     jcc(Assembler::lessEqual, RET_FOUND);
6293 
6294     // Repeat search for small substring (<= 8 chars)
6295     // from new point 'str1' without reloading substring.
6296     cmpl(cnt2, stride);
6297     // Have to check that we don't read beyond string.
6298     jccb(Assembler::lessEqual, ADJUST_STR);
6299 
6300     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6301     // Compare the rest of substring (> 8 chars).
6302     movptr(str1, result);
6303 
6304     cmpl(tmp, cnt2);
6305     // First 8 chars are already matched.
6306     jccb(Assembler::equal, CHECK_NEXT);
6307 
6308     bind(SCAN_SUBSTR);
6309     pcmpestri(vec, Address(str1, 0), mode);
6310     // Need to reload the string pointers if the whole vector did not match
6311     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6312 
6313     bind(CHECK_NEXT);
6314     subl(cnt2, stride);
6315     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6316     addptr(str1, 16);
6317     if (ae == StrIntrinsicNode::UL) {
6318       addptr(str2, 8);
6319     } else {
6320       addptr(str2, 16);
6321     }
6322     subl(cnt1, stride);
6323     cmpl(cnt2, stride); // Do not read beyond substring
6324     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6325     // Back-up strings to avoid reading beyond substring.
6326 
6327     if (ae == StrIntrinsicNode::UL) {
6328       lea(str2, Address(str2, cnt2, scale2, -8));
6329       lea(str1, Address(str1, cnt2, scale1, -16));
6330     } else {
6331       lea(str2, Address(str2, cnt2, scale2, -16));
6332       lea(str1, Address(str1, cnt2, scale1, -16));
6333     }
6334     subl(cnt1, cnt2);
6335     movl(cnt2, stride);
6336     addl(cnt1, stride);
6337     bind(CONT_SCAN_SUBSTR);
6338     if (ae == StrIntrinsicNode::UL) {
6339       pmovzxbw(vec, Address(str2, 0));
6340     } else {
6341       movdqu(vec, Address(str2, 0));
6342     }
6343     jmp(SCAN_SUBSTR);
6344 
6345     bind(RET_FOUND_LONG);
6346     movptr(str1, Address(rsp, wordSize));
6347   } // non constant
6348 
6349   bind(RET_FOUND);
6350   // Compute substr offset
6351   subptr(result, str1);
6352   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6353     shrl(result, 1); // index
6354   }
6355   bind(CLEANUP);
6356   pop(rsp); // restore SP
6357 
6358 } // string_indexof
6359 
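// Search for a single UTF-16 char in a char array; a hedged sketch of the
// equivalent Java-level logic:
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;
//   }
//   return -1;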
6360 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6361                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6362   ShortBranchVerifier sbv(this);
6363   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6364 
6365   int stride = 8;
6366 
6367   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6368         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6369         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6370         FOUND_SEQ_CHAR, DONE_LABEL;
6371 
6372   movptr(result, str1);
6373   if (UseAVX >= 2) {
6374     cmpl(cnt1, stride);
6375     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6376     cmpl(cnt1, 2*stride);
6377     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6378     movdl(vec1, ch);
6379     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6380     vpxor(vec2, vec2);
6381     movl(tmp, cnt1);
6382     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
6383     andl(cnt1,0x0000000F);  //tail count (in chars)
6384 
6385     bind(SCAN_TO_16_CHAR_LOOP);
6386     vmovdqu(vec3, Address(result, 0));
6387     vpcmpeqw(vec3, vec3, vec1, 1);
6388     vptest(vec2, vec3);
6389     jcc(Assembler::carryClear, FOUND_CHAR);
6390     addptr(result, 32);
6391     subl(tmp, 2*stride);
6392     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6393     jmp(SCAN_TO_8_CHAR);
6394     bind(SCAN_TO_8_CHAR_INIT);
6395     movdl(vec1, ch);
6396     pshuflw(vec1, vec1, 0x00);
6397     pshufd(vec1, vec1, 0);
6398     pxor(vec2, vec2);
6399   }
6400   bind(SCAN_TO_8_CHAR);
6401   cmpl(cnt1, stride);
6402   if (UseAVX >= 2) {
6403     jcc(Assembler::less, SCAN_TO_CHAR);
6404   } else {
6405     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6406     movdl(vec1, ch);
6407     pshuflw(vec1, vec1, 0x00);
6408     pshufd(vec1, vec1, 0);
6409     pxor(vec2, vec2);
6410   }
6411   movl(tmp, cnt1);
6412   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
6413   andl(cnt1,0x00000007);  //tail count (in chars)
6414 
6415   bind(SCAN_TO_8_CHAR_LOOP);
6416   movdqu(vec3, Address(result, 0));
6417   pcmpeqw(vec3, vec1);
6418   ptest(vec2, vec3);
6419   jcc(Assembler::carryClear, FOUND_CHAR);
6420   addptr(result, 16);
6421   subl(tmp, stride);
6422   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
6423   bind(SCAN_TO_CHAR);
6424   testl(cnt1, cnt1);
6425   jcc(Assembler::zero, RET_NOT_FOUND);
6426   bind(SCAN_TO_CHAR_LOOP);
6427   load_unsigned_short(tmp, Address(result, 0));
6428   cmpl(ch, tmp);
6429   jccb(Assembler::equal, FOUND_SEQ_CHAR);
6430   addptr(result, 2);
6431   subl(cnt1, 1);
6432   jccb(Assembler::zero, RET_NOT_FOUND);
6433   jmp(SCAN_TO_CHAR_LOOP);
6434 
6435   bind(RET_NOT_FOUND);
6436   movl(result, -1);
6437   jmpb(DONE_LABEL);
6438 
6439   bind(FOUND_CHAR);
6440   if (UseAVX >= 2) {
6441     vpmovmskb(tmp, vec3);
6442   } else {
6443     pmovmskb(tmp, vec3);
6444   }
6445   bsfl(ch, tmp);
6446   addl(result, ch);
6447 
6448   bind(FOUND_SEQ_CHAR);
6449   subptr(result, str1);
6450   shrl(result, 1);
6451 
6452   bind(DONE_LABEL);
6453 } // string_indexof_char
6454 
6455 // helper function for string_compare
6456 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
6457                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
6458                                         Address::ScaleFactor scale2, Register index, int ae) {
6459   if (ae == StrIntrinsicNode::LL) {
6460     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
6461     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
6462   } else if (ae == StrIntrinsicNode::UU) {
6463     load_unsigned_short(elem1, Address(str1, index, scale, 0));
6464     load_unsigned_short(elem2, Address(str2, index, scale, 0));
6465   } else {
6466     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
6467     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
6468   }
6469 }
6470 
6471 // Compare strings, used for char[] and byte[].
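// Result convention, as a hedged sketch of the equivalent Java-level logic
// (the LL/UU/LU/UL encodings only change how elements are loaded):
//   int min = Math.min(cnt1, cnt2);          // lengths in elements
//   for (int i = 0; i < min; i++) {
//     int diff = str1[i] - str2[i];
//     if (diff != 0) return diff;
//   }
//   return cnt1 - cnt2;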
6472 void MacroAssembler::string_compare(Register str1, Register str2,
6473                                     Register cnt1, Register cnt2, Register result,
6474                                     XMMRegister vec1, int ae) {
6475   ShortBranchVerifier sbv(this);
6476   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6477   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
6478   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
6479   int stride2x2 = 0x40;
6480   Address::ScaleFactor scale = Address::no_scale;
6481   Address::ScaleFactor scale1 = Address::no_scale;
6482   Address::ScaleFactor scale2 = Address::no_scale;
6483 
6484   if (ae != StrIntrinsicNode::LL) {
6485     stride2x2 = 0x20;
6486   }
6487 
6488   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
6489     shrl(cnt2, 1);
6490   }
6491   // Compute the minimum of the string lengths and the
6492   // difference of the string lengths (stack).
6493   // Do the conditional move stuff
6494   movl(result, cnt1);
6495   subl(cnt1, cnt2);
6496   push(cnt1);
6497   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
6498 
6499   // Is the minimum length zero?
6500   testl(cnt2, cnt2);
6501   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6502   if (ae == StrIntrinsicNode::LL) {
6503     // Load first bytes
6504     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
6505     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
6506   } else if (ae == StrIntrinsicNode::UU) {
6507     // Load first characters
6508     load_unsigned_short(result, Address(str1, 0));
6509     load_unsigned_short(cnt1, Address(str2, 0));
6510   } else {
6511     load_unsigned_byte(result, Address(str1, 0));
6512     load_unsigned_short(cnt1, Address(str2, 0));
6513   }
6514   subl(result, cnt1);
6515   jcc(Assembler::notZero,  POP_LABEL);
6516 
6517   if (ae == StrIntrinsicNode::UU) {
6518     // Divide length by 2 to get number of chars
6519     shrl(cnt2, 1);
6520   }
6521   cmpl(cnt2, 1);
6522   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6523 
6524   // Check if the strings start at the same location and setup scale and stride
6525   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6526     cmpptr(str1, str2);
6527     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6528     if (ae == StrIntrinsicNode::LL) {
6529       scale = Address::times_1;
6530       stride = 16;
6531     } else {
6532       scale = Address::times_2;
6533       stride = 8;
6534     }
6535   } else {
6536     scale1 = Address::times_1;
6537     scale2 = Address::times_2;
6538     // scale not used
6539     stride = 8;
6540   }
6541 
6542   if (UseAVX >= 2 && UseSSE42Intrinsics) {
6543     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6544     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6545     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
6546     Label COMPARE_TAIL_LONG;
6547     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
6548 
6549     int pcmpmask = 0x19;
6550     if (ae == StrIntrinsicNode::LL) {
6551       pcmpmask &= ~0x01;
6552     }
6553 
6554     // Setup to compare 16-chars (32-bytes) vectors,
6555     // start from first character again because it has aligned address.
6556     if (ae == StrIntrinsicNode::LL) {
6557       stride2 = 32;
6558     } else {
6559       stride2 = 16;
6560     }
6561     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6562       adr_stride = stride << scale;
6563     } else {
6564       adr_stride1 = 8;  //stride << scale1;
6565       adr_stride2 = 16; //stride << scale2;
6566     }
6567 
6568     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6569     // rax and rdx are used by pcmpestri as elements counters
6570     movl(result, cnt2);
6571     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
6572     jcc(Assembler::zero, COMPARE_TAIL_LONG);
6573 
6574     // fast path : compare first 2 8-char vectors.
6575     bind(COMPARE_16_CHARS);
6576     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6577       movdqu(vec1, Address(str1, 0));
6578     } else {
6579       pmovzxbw(vec1, Address(str1, 0));
6580     }
6581     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6582     jccb(Assembler::below, COMPARE_INDEX_CHAR);
6583 
6584     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6585       movdqu(vec1, Address(str1, adr_stride));
6586       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6587     } else {
6588       pmovzxbw(vec1, Address(str1, adr_stride1));
6589       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
6590     }
6591     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6592     addl(cnt1, stride);
6593 
6594     // Compare the characters at index in cnt1
6595     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
6596     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6597     subl(result, cnt2);
6598     jmp(POP_LABEL);
6599 
6600     // Setup the registers to start vector comparison loop
6601     bind(COMPARE_WIDE_VECTORS);
6602     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6603       lea(str1, Address(str1, result, scale));
6604       lea(str2, Address(str2, result, scale));
6605     } else {
6606       lea(str1, Address(str1, result, scale1));
6607       lea(str2, Address(str2, result, scale2));
6608     }
6609     subl(result, stride2);
6610     subl(cnt2, stride2);
6611     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
6612     negptr(result);
6613 
6614     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6615     bind(COMPARE_WIDE_VECTORS_LOOP);
6616 
6617 #ifdef _LP64
6618     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
6619       cmpl(cnt2, stride2x2);
6620       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
6621       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
6622       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
6623 
6624       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
6625       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6626         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
6627         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6628       } else {
6629         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
6630         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6631       }
6632       kortestql(k7, k7);
6633       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
6634       addptr(result, stride2x2);  // update since we already compared at this addr
6635       subl(cnt2, stride2x2);      // and sub the size too
6636       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
6637 
6638       vpxor(vec1, vec1);
6639       jmpb(COMPARE_WIDE_TAIL);
6640     }//if (VM_Version::supports_avx512vlbw())
6641 #endif // _LP64
6642 
6643 
6644     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6645     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6646       vmovdqu(vec1, Address(str1, result, scale));
6647       vpxor(vec1, Address(str2, result, scale));
6648     } else {
6649       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
6650       vpxor(vec1, Address(str2, result, scale2));
6651     }
6652     vptest(vec1, vec1);
6653     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
6654     addptr(result, stride2);
6655     subl(cnt2, stride2);
6656     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6657     // clean upper bits of YMM registers
6658     vpxor(vec1, vec1);
6659 
6660     // compare wide vectors tail
6661     bind(COMPARE_WIDE_TAIL);
6662     testptr(result, result);
6663     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6664 
6665     movl(result, stride2);
6666     movl(cnt2, result);
6667     negptr(result);
6668     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6669 
6670     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6671     bind(VECTOR_NOT_EQUAL);
6672     // clean upper bits of YMM registers
6673     vpxor(vec1, vec1);
6674     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6675       lea(str1, Address(str1, result, scale));
6676       lea(str2, Address(str2, result, scale));
6677     } else {
6678       lea(str1, Address(str1, result, scale1));
6679       lea(str2, Address(str2, result, scale2));
6680     }
6681     jmp(COMPARE_16_CHARS);
6682 
6683     // Compare tail chars, length between 1 and 15 chars
6684     bind(COMPARE_TAIL_LONG);
6685     movl(cnt2, result);
6686     cmpl(cnt2, stride);
6687     jcc(Assembler::less, COMPARE_SMALL_STR);
6688 
6689     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6690       movdqu(vec1, Address(str1, 0));
6691     } else {
6692       pmovzxbw(vec1, Address(str1, 0));
6693     }
6694     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6695     jcc(Assembler::below, COMPARE_INDEX_CHAR);
6696     subptr(cnt2, stride);
6697     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6698     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6699       lea(str1, Address(str1, result, scale));
6700       lea(str2, Address(str2, result, scale));
6701     } else {
6702       lea(str1, Address(str1, result, scale1));
6703       lea(str2, Address(str2, result, scale2));
6704     }
6705     negptr(cnt2);
6706     jmpb(WHILE_HEAD_LABEL);
6707 
6708     bind(COMPARE_SMALL_STR);
6709   } else if (UseSSE42Intrinsics) {
6710     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6711     int pcmpmask = 0x19;
6712     // Setup to compare 8-char (16-byte) vectors,
6713     // start from first character again because it has aligned address.
6714     movl(result, cnt2);
6715     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
6716     if (ae == StrIntrinsicNode::LL) {
6717       pcmpmask &= ~0x01;
6718     }
6719     jcc(Assembler::zero, COMPARE_TAIL);
6720     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6721       lea(str1, Address(str1, result, scale));
6722       lea(str2, Address(str2, result, scale));
6723     } else {
6724       lea(str1, Address(str1, result, scale1));
6725       lea(str2, Address(str2, result, scale2));
6726     }
6727     negptr(result);
6728 
6729     // pcmpestri
6730     //   inputs:
6731     //     vec1- substring
6732     //     rax - negative string length (elements count)
6733     //     mem - scanned string
6734     //     rdx - string length (elements count)
6735     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
6736     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
6737     //   outputs:
6738     //     rcx - first mismatched element index
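    // For example, with UU data pcmpmask is 0x19: bits[1:0] = 01 (unsigned
    // words), bits[3:2] = 10 (equal each) and bits[5:4] = 01 (negate the
    // result), so rcx receives the index of the first element that differs.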
6739     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6740 
6741     bind(COMPARE_WIDE_VECTORS);
6742     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6743       movdqu(vec1, Address(str1, result, scale));
6744       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6745     } else {
6746       pmovzxbw(vec1, Address(str1, result, scale1));
6747       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6748     }
6749     // After pcmpestri cnt1(rcx) contains mismatched element index
6750 
6751     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
6752     addptr(result, stride);
6753     subptr(cnt2, stride);
6754     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6755 
6756     // compare wide vectors tail
6757     testptr(result, result);
6758     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6759 
6760     movl(cnt2, stride);
6761     movl(result, stride);
6762     negptr(result);
6763     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6764       movdqu(vec1, Address(str1, result, scale));
6765       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6766     } else {
6767       pmovzxbw(vec1, Address(str1, result, scale1));
6768       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6769     }
6770     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6771 
6772     // Mismatched characters in the vectors
6773     bind(VECTOR_NOT_EQUAL);
6774     addptr(cnt1, result);
6775     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6776     subl(result, cnt2);
6777     jmpb(POP_LABEL);
6778 
6779     bind(COMPARE_TAIL); // limit is zero
6780     movl(cnt2, result);
6781     // Fallthru to tail compare
6782   }
6783   // Shift str2 and str1 to the end of the arrays, negate min
6784   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6785     lea(str1, Address(str1, cnt2, scale));
6786     lea(str2, Address(str2, cnt2, scale));
6787   } else {
6788     lea(str1, Address(str1, cnt2, scale1));
6789     lea(str2, Address(str2, cnt2, scale2));
6790   }
6791   decrementl(cnt2);  // first character was compared already
6792   negptr(cnt2);
6793 
6794   // Compare the rest of the elements
6795   bind(WHILE_HEAD_LABEL);
6796   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
6797   subl(result, cnt1);
6798   jccb(Assembler::notZero, POP_LABEL);
6799   increment(cnt2);
6800   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6801 
6802   // Strings are equal up to min length.  Return the length difference.
6803   bind(LENGTH_DIFF_LABEL);
6804   pop(result);
6805   if (ae == StrIntrinsicNode::UU) {
6806     // Divide diff by 2 to get number of chars
6807     sarl(result, 1);
6808   }
6809   jmpb(DONE_LABEL);
6810 
6811 #ifdef _LP64
6812   if (VM_Version::supports_avx512vlbw()) {
6813 
6814     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
6815 
6816     kmovql(cnt1, k7);
6817     notq(cnt1);
6818     bsfq(cnt2, cnt1);
6819     if (ae != StrIntrinsicNode::LL) {
6820       // Divide diff by 2 to get number of chars
6821       sarl(cnt2, 1);
6822     }
6823     addq(result, cnt2);
6824     if (ae == StrIntrinsicNode::LL) {
6825       load_unsigned_byte(cnt1, Address(str2, result));
6826       load_unsigned_byte(result, Address(str1, result));
6827     } else if (ae == StrIntrinsicNode::UU) {
6828       load_unsigned_short(cnt1, Address(str2, result, scale));
6829       load_unsigned_short(result, Address(str1, result, scale));
6830     } else {
6831       load_unsigned_short(cnt1, Address(str2, result, scale2));
6832       load_unsigned_byte(result, Address(str1, result, scale1));
6833     }
6834     subl(result, cnt1);
6835     jmpb(POP_LABEL);
6836   }//if (VM_Version::supports_avx512vlbw())
6837 #endif // _LP64
6838 
6839   // Discard the stored length difference
6840   bind(POP_LABEL);
6841   pop(cnt1);
6842 
6843   // That's it
6844   bind(DONE_LABEL);
6845   if (ae == StrIntrinsicNode::UL) {
6846     negl(result);
6847   }
6848 
6849 }
6850 
6851 // Search for Non-ASCII character (Negative byte value) in a byte array,
6852 // return true if it has any and false otherwise.
6853 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
6854 //   @HotSpotIntrinsicCandidate
6855 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
6856 //     for (int i = off; i < off + len; i++) {
6857 //       if (ba[i] < 0) {
6858 //         return true;
6859 //       }
6860 //     }
6861 //     return false;
6862 //   }
6863 void MacroAssembler::has_negatives(Register ary1, Register len,
6864   Register result, Register tmp1,
6865   XMMRegister vec1, XMMRegister vec2) {
6866   // rsi: byte array
6867   // rcx: len
6868   // rax: result
6869   ShortBranchVerifier sbv(this);
6870   assert_different_registers(ary1, len, result, tmp1);
6871   assert_different_registers(vec1, vec2);
6872   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
6873 
6874   // len == 0
6875   testl(len, len);
6876   jcc(Assembler::zero, FALSE_LABEL);
6877 
6878   if ((UseAVX > 2) && // AVX512
6879     VM_Version::supports_avx512vlbw() &&
6880     VM_Version::supports_bmi2()) {
6881 
6882     Label test_64_loop, test_tail;
6883     Register tmp3_aliased = len;
6884 
6885     movl(tmp1, len);
6886     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
6887 
6888     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
6889     andl(len, ~(64 - 1));    // vector count (in chars)
6890     jccb(Assembler::zero, test_tail);
6891 
6892     lea(ary1, Address(ary1, len, Address::times_1));
6893     negptr(len);
6894 
6895     bind(test_64_loop);
6896     // Check whether our 64 elements of size byte contain negatives
6897     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
6898     kortestql(k2, k2);
6899     jcc(Assembler::notZero, TRUE_LABEL);
6900 
6901     addptr(len, 64);
6902     jccb(Assembler::notZero, test_64_loop);
6903 
6904 
6905     bind(test_tail);
6906     // bail out when there is nothing to be done
6907     testl(tmp1, -1);
6908     jcc(Assembler::zero, FALSE_LABEL);
6909 
6910     // ~(~0 << len) applied up to two times (for 32-bit scenario)
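    // e.g. tmp1 == 5 yields a mask with the five low bits set (0x1f), so only
    // the 5 remaining tail bytes participate in the masked compare below.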
6911 #ifdef _LP64
6912     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
6913     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
6914     notq(tmp3_aliased);
6915     kmovql(k3, tmp3_aliased);
6916 #else
6917     Label k_init;
6918     jmp(k_init);
6919 
6920     // We cannot read 64 bits from a general-purpose register, so we place
6921     // the data required to compose 64 1's into the instruction stream.
6922     // We emit a 64-byte-wide series of elements 0..63 which is later used
6923     // as the compare target against the tail count held in the tmp1 register.
6924     // The result is a k register with tmp1 consecutive 1's, counting from
6925     // the least significant bit.
6926     address tmp = pc();
6927     emit_int64(0x0706050403020100);
6928     emit_int64(0x0F0E0D0C0B0A0908);
6929     emit_int64(0x1716151413121110);
6930     emit_int64(0x1F1E1D1C1B1A1918);
6931     emit_int64(0x2726252423222120);
6932     emit_int64(0x2F2E2D2C2B2A2928);
6933     emit_int64(0x3736353433323130);
6934     emit_int64(0x3F3E3D3C3B3A3938);
6935 
6936     bind(k_init);
6937     lea(len, InternalAddress(tmp));
6938     // create mask to test for negative byte inside a vector
6939     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
6940     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
6941 
6942 #endif
6943     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
6944     ktestq(k2, k3);
6945     jcc(Assembler::notZero, TRUE_LABEL);
6946 
6947     jmp(FALSE_LABEL);
6948   } else {
6949     movl(result, len); // copy
6950 
6951     if (UseAVX >= 2 && UseSSE >= 2) {
6952       // With AVX2, use 32-byte vector compare
6953       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6954 
6955       // Compare 32-byte vectors
6956       andl(result, 0x0000001f);  //   tail count (in bytes)
6957       andl(len, 0xffffffe0);   // vector count (in bytes)
6958       jccb(Assembler::zero, COMPARE_TAIL);
6959 
6960       lea(ary1, Address(ary1, len, Address::times_1));
6961       negptr(len);
6962 
6963       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
6964       movdl(vec2, tmp1);
6965       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
6966 
6967       bind(COMPARE_WIDE_VECTORS);
6968       vmovdqu(vec1, Address(ary1, len, Address::times_1));
6969       vptest(vec1, vec2);
6970       jccb(Assembler::notZero, TRUE_LABEL);
6971       addptr(len, 32);
6972       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6973 
6974       testl(result, result);
6975       jccb(Assembler::zero, FALSE_LABEL);
6976 
6977       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6978       vptest(vec1, vec2);
6979       jccb(Assembler::notZero, TRUE_LABEL);
6980       jmpb(FALSE_LABEL);
6981 
6982       bind(COMPARE_TAIL); // len is zero
6983       movl(len, result);
6984       // Fallthru to tail compare
6985     } else if (UseSSE42Intrinsics) {
6986       // With SSE4.2, use double quad vector compare
6987       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6988 
6989       // Compare 16-byte vectors
6990       andl(result, 0x0000000f);  //   tail count (in bytes)
6991       andl(len, 0xfffffff0);   // vector count (in bytes)
6992       jcc(Assembler::zero, COMPARE_TAIL);
6993 
6994       lea(ary1, Address(ary1, len, Address::times_1));
6995       negptr(len);
6996 
6997       movl(tmp1, 0x80808080);
6998       movdl(vec2, tmp1);
6999       pshufd(vec2, vec2, 0);
7000 
7001       bind(COMPARE_WIDE_VECTORS);
7002       movdqu(vec1, Address(ary1, len, Address::times_1));
7003       ptest(vec1, vec2);
7004       jcc(Assembler::notZero, TRUE_LABEL);
7005       addptr(len, 16);
7006       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7007 
7008       testl(result, result);
7009       jcc(Assembler::zero, FALSE_LABEL);
7010 
7011       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7012       ptest(vec1, vec2);
7013       jccb(Assembler::notZero, TRUE_LABEL);
7014       jmpb(FALSE_LABEL);
7015 
7016       bind(COMPARE_TAIL); // len is zero
7017       movl(len, result);
7018       // Fallthru to tail compare
7019     }
7020   }
7021   // Compare 4-byte vectors
7022   andl(len, 0xfffffffc); // vector count (in bytes)
7023   jccb(Assembler::zero, COMPARE_CHAR);
7024 
7025   lea(ary1, Address(ary1, len, Address::times_1));
7026   negptr(len);
7027 
7028   bind(COMPARE_VECTORS);
7029   movl(tmp1, Address(ary1, len, Address::times_1));
7030   andl(tmp1, 0x80808080);
7031   jccb(Assembler::notZero, TRUE_LABEL);
7032   addptr(len, 4);
7033   jcc(Assembler::notZero, COMPARE_VECTORS);
7034 
7035   // Compare trailing char (final 2 bytes), if any
7036   bind(COMPARE_CHAR);
7037   testl(result, 0x2);   // tail  char
7038   jccb(Assembler::zero, COMPARE_BYTE);
7039   load_unsigned_short(tmp1, Address(ary1, 0));
7040   andl(tmp1, 0x00008080);
7041   jccb(Assembler::notZero, TRUE_LABEL);
7042   subptr(result, 2);
7043   lea(ary1, Address(ary1, 2));
7044 
7045   bind(COMPARE_BYTE);
7046   testl(result, 0x1);   // tail  byte
7047   jccb(Assembler::zero, FALSE_LABEL);
7048   load_unsigned_byte(tmp1, Address(ary1, 0));
7049   andl(tmp1, 0x00000080);
7050   jccb(Assembler::notEqual, TRUE_LABEL);
7051   jmpb(FALSE_LABEL);
7052 
7053   bind(TRUE_LABEL);
7054   movl(result, 1);   // return true
7055   jmpb(DONE);
7056 
7057   bind(FALSE_LABEL);
7058   xorl(result, result); // return false
7059 
7060   // That's it
7061   bind(DONE);
7062   if (UseAVX >= 2 && UseSSE >= 2) {
7063     // clean upper bits of YMM registers
7064     vpxor(vec1, vec1);
7065     vpxor(vec2, vec2);
7066   }
7067 }
7068 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
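// When is_array_equ is true this is a hedged sketch of the equivalent
// Java-level logic (for substrings only the element loop applies, over
// 'limit' elements):
//   if (ary1 == ary2) return true;
//   if (ary1 == null || ary2 == null) return false;
//   if (ary1.length != ary2.length) return false;
//   for (int i = 0; i < ary1.length; i++) {
//     if (ary1[i] != ary2[i]) return false;
//   }
//   return true;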
7069 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
7070                                    Register limit, Register result, Register chr,
7071                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
7072   ShortBranchVerifier sbv(this);
7073   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
7074 
7075   int length_offset  = arrayOopDesc::length_offset_in_bytes();
7076   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
7077 
7078   if (is_array_equ) {
7079     // Check the input args
7080     cmpoop(ary1, ary2);
7081     jcc(Assembler::equal, TRUE_LABEL);
7082 
7083     // Need additional checks for arrays_equals.
7084     testptr(ary1, ary1);
7085     jcc(Assembler::zero, FALSE_LABEL);
7086     testptr(ary2, ary2);
7087     jcc(Assembler::zero, FALSE_LABEL);
7088 
7089     // Check the lengths
7090     movl(limit, Address(ary1, length_offset));
7091     cmpl(limit, Address(ary2, length_offset));
7092     jcc(Assembler::notEqual, FALSE_LABEL);
7093   }
7094 
7095   // count == 0
7096   testl(limit, limit);
7097   jcc(Assembler::zero, TRUE_LABEL);
7098 
7099   if (is_array_equ) {
7100     // Load array address
7101     lea(ary1, Address(ary1, base_offset));
7102     lea(ary2, Address(ary2, base_offset));
7103   }
7104 
7105   if (is_array_equ && is_char) {
7106     // arrays_equals when used for char[].
7107     shll(limit, 1);      // byte count != 0
7108   }
7109   movl(result, limit); // copy
7110 
7111   if (UseAVX >= 2) {
7112     // With AVX2, use 32-byte vector compare
7113     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7114 
7115     // Compare 32-byte vectors
7116     andl(result, 0x0000001f);  //   tail count (in bytes)
7117     andl(limit, 0xffffffe0);   // vector count (in bytes)
7118     jcc(Assembler::zero, COMPARE_TAIL);
7119 
7120     lea(ary1, Address(ary1, limit, Address::times_1));
7121     lea(ary2, Address(ary2, limit, Address::times_1));
7122     negptr(limit);
7123 
7124     bind(COMPARE_WIDE_VECTORS);
7125 
7126 #ifdef _LP64
7127     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7128       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
7129 
7130       cmpl(limit, -64);
7131       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7132 
7133       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7134 
7135       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
7136       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
7137       kortestql(k7, k7);
7138       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7139       addptr(limit, 64);  // update since we already compared at this addr
7140       cmpl(limit, -64);
7141       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7142 
7143       // At this point we may still need to compare -limit+result bytes.
7144       // We could execute the next two instructions and just continue via the non-wide path:
7145       //  cmpl(limit, 0);
7146       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
7147       // But since we stopped at the points ary{1,2}+limit which are
7148       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
7149       // (|limit| <= 32 and result < 32),
7150       // we may just compare the last 64 bytes.
7151       //
7152       addptr(result, -64);   // it is safe because we just came from this area
7153       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
7154       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
7155       kortestql(k7, k7);
7156       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7157 
7158       jmp(TRUE_LABEL);
7159 
7160       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7161 
7162     }//if (VM_Version::supports_avx512vlbw())
7163 #endif //_LP64
7164 
7165     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
7166     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
7167     vpxor(vec1, vec2);
7168 
7169     vptest(vec1, vec1);
7170     jcc(Assembler::notZero, FALSE_LABEL);
7171     addptr(limit, 32);
7172     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7173 
7174     testl(result, result);
7175     jcc(Assembler::zero, TRUE_LABEL);
7176 
7177     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7178     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
7179     vpxor(vec1, vec2);
7180 
7181     vptest(vec1, vec1);
7182     jccb(Assembler::notZero, FALSE_LABEL);
7183     jmpb(TRUE_LABEL);
7184 
7185     bind(COMPARE_TAIL); // limit is zero
7186     movl(limit, result);
7187     // Fallthru to tail compare
7188   } else if (UseSSE42Intrinsics) {
7189     // With SSE4.2, use double quad vector compare
7190     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7191 
7192     // Compare 16-byte vectors
7193     andl(result, 0x0000000f);  //   tail count (in bytes)
7194     andl(limit, 0xfffffff0);   // vector count (in bytes)
7195     jcc(Assembler::zero, COMPARE_TAIL);
7196 
7197     lea(ary1, Address(ary1, limit, Address::times_1));
7198     lea(ary2, Address(ary2, limit, Address::times_1));
7199     negptr(limit);
7200 
7201     bind(COMPARE_WIDE_VECTORS);
7202     movdqu(vec1, Address(ary1, limit, Address::times_1));
7203     movdqu(vec2, Address(ary2, limit, Address::times_1));
7204     pxor(vec1, vec2);
7205 
7206     ptest(vec1, vec1);
7207     jcc(Assembler::notZero, FALSE_LABEL);
7208     addptr(limit, 16);
7209     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7210 
7211     testl(result, result);
7212     jcc(Assembler::zero, TRUE_LABEL);
7213 
7214     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7215     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7216     pxor(vec1, vec2);
7217 
7218     ptest(vec1, vec1);
7219     jccb(Assembler::notZero, FALSE_LABEL);
7220     jmpb(TRUE_LABEL);
7221 
7222     bind(COMPARE_TAIL); // limit is zero
7223     movl(limit, result);
7224     // Fallthru to tail compare
7225   }
7226 
7227   // Compare 4-byte vectors
7228   andl(limit, 0xfffffffc); // vector count (in bytes)
7229   jccb(Assembler::zero, COMPARE_CHAR);
7230 
7231   lea(ary1, Address(ary1, limit, Address::times_1));
7232   lea(ary2, Address(ary2, limit, Address::times_1));
7233   negptr(limit);
7234 
7235   bind(COMPARE_VECTORS);
7236   movl(chr, Address(ary1, limit, Address::times_1));
7237   cmpl(chr, Address(ary2, limit, Address::times_1));
7238   jccb(Assembler::notEqual, FALSE_LABEL);
7239   addptr(limit, 4);
7240   jcc(Assembler::notZero, COMPARE_VECTORS);
7241 
7242   // Compare trailing char (final 2 bytes), if any
7243   bind(COMPARE_CHAR);
7244   testl(result, 0x2);   // tail  char
7245   jccb(Assembler::zero, COMPARE_BYTE);
7246   load_unsigned_short(chr, Address(ary1, 0));
7247   load_unsigned_short(limit, Address(ary2, 0));
7248   cmpl(chr, limit);
7249   jccb(Assembler::notEqual, FALSE_LABEL);
7250 
7251   if (is_array_equ && is_char) {
7252     bind(COMPARE_BYTE);
7253   } else {
7254     lea(ary1, Address(ary1, 2));
7255     lea(ary2, Address(ary2, 2));
7256 
7257     bind(COMPARE_BYTE);
7258     testl(result, 0x1);   // tail  byte
7259     jccb(Assembler::zero, TRUE_LABEL);
7260     load_unsigned_byte(chr, Address(ary1, 0));
7261     load_unsigned_byte(limit, Address(ary2, 0));
7262     cmpl(chr, limit);
7263     jccb(Assembler::notEqual, FALSE_LABEL);
7264   }
7265   bind(TRUE_LABEL);
7266   movl(result, 1);   // return true
7267   jmpb(DONE);
7268 
7269   bind(FALSE_LABEL);
7270   xorl(result, result); // return false
7271 
7272   // That's it
7273   bind(DONE);
7274   if (UseAVX >= 2) {
7275     // clean upper bits of YMM registers
7276     vpxor(vec1, vec1);
7277     vpxor(vec2, vec2);
7278   }
7279 }
7280 
7281 #endif
7282 
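     // Fill `count` elements of type `t` starting at `to` with `value`.
     // `count` is an element count: (1 << shift) elements span 4 bytes, so the
     // chunked loops below move 32 bytes per (8 << shift) elements.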
7283 void MacroAssembler::generate_fill(BasicType t, bool aligned,
7284                                    Register to, Register value, Register count,
7285                                    Register rtmp, XMMRegister xtmp) {
7286   ShortBranchVerifier sbv(this);
7287   assert_different_registers(to, value, count, rtmp);
7288   Label L_exit;
7289   Label L_fill_2_bytes, L_fill_4_bytes;
7290 
7291   int shift = -1;
7292   switch (t) {
7293     case T_BYTE:
7294       shift = 2;
7295       break;
7296     case T_SHORT:
7297       shift = 1;
7298       break;
7299     case T_INT:
7300       shift = 0;
7301       break;
7302     default: ShouldNotReachHere();
7303   }
7304 
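       // Replicate a byte/short value across a full 32-bit word so the 4-byte
       // (and wider) stores below write the same value into every element.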
7305   if (t == T_BYTE) {
7306     andl(value, 0xff);
7307     movl(rtmp, value);
7308     shll(rtmp, 8);
7309     orl(value, rtmp);
7310   }
7311   if (t == T_SHORT) {
7312     andl(value, 0xffff);
7313   }
7314   if (t == T_BYTE || t == T_SHORT) {
7315     movl(rtmp, value);
7316     shll(rtmp, 16);
7317     orl(value, rtmp);
7318   }
7319 
7320   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
7321   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7322   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7323     Label L_skip_align2;
7324     // align source address at 4 bytes address boundary
7325     if (t == T_BYTE) {
7326       Label L_skip_align1;
7327       // One byte misalignment happens only for byte arrays
7328       testptr(to, 1);
7329       jccb(Assembler::zero, L_skip_align1);
7330       movb(Address(to, 0), value);
7331       increment(to);
7332       decrement(count);
7333       BIND(L_skip_align1);
7334     }
7335     // Two bytes misalignment happens only for byte and short (char) arrays
7336     testptr(to, 2);
7337     jccb(Assembler::zero, L_skip_align2);
7338     movw(Address(to, 0), value);
7339     addptr(to, 2);
7340     subl(count, 1<<(shift-1));
7341     BIND(L_skip_align2);
7342   }
7343   if (UseSSE < 2) {
7344     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7345     // Fill 32-byte chunks
7346     subl(count, 8 << shift);
7347     jcc(Assembler::less, L_check_fill_8_bytes);
7348     align(16);
7349 
7350     BIND(L_fill_32_bytes_loop);
7351 
7352     for (int i = 0; i < 32; i += 4) {
7353       movl(Address(to, i), value);
7354     }
7355 
7356     addptr(to, 32);
7357     subl(count, 8 << shift);
7358     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7359     BIND(L_check_fill_8_bytes);
7360     addl(count, 8 << shift);
7361     jccb(Assembler::zero, L_exit);
7362     jmpb(L_fill_8_bytes);
7363 
7364     //
7365     // length is too short, just fill qwords
7366     //
7367     BIND(L_fill_8_bytes_loop);
7368     movl(Address(to, 0), value);
7369     movl(Address(to, 4), value);
7370     addptr(to, 8);
7371     BIND(L_fill_8_bytes);
7372     subl(count, 1 << (shift + 1));
7373     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7374     // fall through to fill 4 bytes
7375   } else {
7376     Label L_fill_32_bytes;
7377     if (!UseUnalignedLoadStores) {
7378       // align to 8 bytes, we know we are 4 byte aligned to start
7379       testptr(to, 4);
7380       jccb(Assembler::zero, L_fill_32_bytes);
7381       movl(Address(to, 0), value);
7382       addptr(to, 4);
7383       subl(count, 1<<shift);
7384     }
7385     BIND(L_fill_32_bytes);
7386     {
7387       assert( UseSSE >= 2, "supported cpu only" );
7388       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7389       movdl(xtmp, value);
7390       if (UseAVX > 2 && UseUnalignedLoadStores) {
7391         // Fill 64-byte chunks
7392         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7393         vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7394 
7395         subl(count, 16 << shift);
7396         jcc(Assembler::less, L_check_fill_32_bytes);
7397         align(16);
7398 
7399         BIND(L_fill_64_bytes_loop);
7400         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7401         addptr(to, 64);
7402         subl(count, 16 << shift);
7403         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7404 
7405         BIND(L_check_fill_32_bytes);
7406         addl(count, 8 << shift);
7407         jccb(Assembler::less, L_check_fill_8_bytes);
7408         vmovdqu(Address(to, 0), xtmp);
7409         addptr(to, 32);
7410         subl(count, 8 << shift);
7411 
7412         BIND(L_check_fill_8_bytes);
7413       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7414         // Fill 64-byte chunks
7415         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7416         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7417 
7418         subl(count, 16 << shift);
7419         jcc(Assembler::less, L_check_fill_32_bytes);
7420         align(16);
7421 
7422         BIND(L_fill_64_bytes_loop);
7423         vmovdqu(Address(to, 0), xtmp);
7424         vmovdqu(Address(to, 32), xtmp);
7425         addptr(to, 64);
7426         subl(count, 16 << shift);
7427         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7428 
7429         BIND(L_check_fill_32_bytes);
7430         addl(count, 8 << shift);
7431         jccb(Assembler::less, L_check_fill_8_bytes);
7432         vmovdqu(Address(to, 0), xtmp);
7433         addptr(to, 32);
7434         subl(count, 8 << shift);
7435 
7436         BIND(L_check_fill_8_bytes);
7437         // clean upper bits of YMM registers
7438         movdl(xtmp, value);
7439         pshufd(xtmp, xtmp, 0);
7440       } else {
7441         // Fill 32-byte chunks
7442         pshufd(xtmp, xtmp, 0);
7443 
7444         subl(count, 8 << shift);
7445         jcc(Assembler::less, L_check_fill_8_bytes);
7446         align(16);
7447 
7448         BIND(L_fill_32_bytes_loop);
7449 
7450         if (UseUnalignedLoadStores) {
7451           movdqu(Address(to, 0), xtmp);
7452           movdqu(Address(to, 16), xtmp);
7453         } else {
7454           movq(Address(to, 0), xtmp);
7455           movq(Address(to, 8), xtmp);
7456           movq(Address(to, 16), xtmp);
7457           movq(Address(to, 24), xtmp);
7458         }
7459 
7460         addptr(to, 32);
7461         subl(count, 8 << shift);
7462         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7463 
7464         BIND(L_check_fill_8_bytes);
7465       }
7466       addl(count, 8 << shift);
7467       jccb(Assembler::zero, L_exit);
7468       jmpb(L_fill_8_bytes);
7469 
7470       //
7471       // length is too short, just fill qwords
7472       //
7473       BIND(L_fill_8_bytes_loop);
7474       movq(Address(to, 0), xtmp);
7475       addptr(to, 8);
7476       BIND(L_fill_8_bytes);
7477       subl(count, 1 << (shift + 1));
7478       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7479     }
7480   }
7481   // fill trailing 4 bytes
7482   BIND(L_fill_4_bytes);
7483   testl(count, 1<<shift);
7484   jccb(Assembler::zero, L_fill_2_bytes);
7485   movl(Address(to, 0), value);
7486   if (t == T_BYTE || t == T_SHORT) {
7487     Label L_fill_byte;
7488     addptr(to, 4);
7489     BIND(L_fill_2_bytes);
7490     // fill trailing 2 bytes
7491     testl(count, 1<<(shift-1));
7492     jccb(Assembler::zero, L_fill_byte);
7493     movw(Address(to, 0), value);
7494     if (t == T_BYTE) {
7495       addptr(to, 2);
7496       BIND(L_fill_byte);
7497       // fill trailing byte
7498       testl(count, 1);
7499       jccb(Assembler::zero, L_exit);
7500       movb(Address(to, 0), value);
7501     } else {
7502       BIND(L_fill_byte);
7503     }
7504   } else {
7505     BIND(L_fill_2_bytes);
7506   }
7507   BIND(L_exit);
7508 }
7509 
7510 // encode char[] to byte[] in ISO_8859_1
7511 //   @HotSpotIntrinsicCandidate
7512 //   private static int implEncodeISOArray(byte[] sa, int sp,
7513 //                                         byte[] da, int dp, int len) {
7514 //     int i = 0;
7515 //     for (; i < len; i++) {
7516 //       char c = StringUTF16.getChar(sa, sp++);
7517 //       if (c > '\u00FF')
7518 //         break;
7519 //       da[dp++] = (byte)c;
7520 //     }
7521 //     return i;
7522 //   }
7523 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7524   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7525   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7526   Register tmp5, Register result) {
7527 
7528   // rsi: src
7529   // rdi: dst
7530   // rdx: len
7531   // rcx: tmp5
7532   // rax: result
7533   ShortBranchVerifier sbv(this);
7534   assert_different_registers(src, dst, len, tmp5, result);
7535   Label L_done, L_copy_1_char, L_copy_1_char_exit;
7536 
7537   // set result
7538   xorl(result, result);
7539   // check for zero length
7540   testl(len, len);
7541   jcc(Assembler::zero, L_done);
7542 
7543   movl(result, len);
7544 
7545   // Setup pointers
7546   lea(src, Address(src, len, Address::times_2)); // char[]
7547   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7548   negptr(len);
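       // src and dst now point just past their last element and len is negated,
       // so the indexed addressing below walks forward and len reaches zero at the end.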
7549 
7550   if (UseSSE42Intrinsics || UseAVX >= 2) {
7551     Label L_copy_8_chars, L_copy_8_chars_exit;
7552     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7553 
7554     if (UseAVX >= 2) {
7555       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7556       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7557       movdl(tmp1Reg, tmp5);
7558       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
7559       jmp(L_chars_32_check);
7560 
7561       bind(L_copy_32_chars);
7562       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7563       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7564       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7565       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7566       jccb(Assembler::notZero, L_copy_32_chars_exit);
7567       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7568       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7569       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7570 
7571       bind(L_chars_32_check);
7572       addptr(len, 32);
7573       jcc(Assembler::lessEqual, L_copy_32_chars);
7574 
7575       bind(L_copy_32_chars_exit);
7576       subptr(len, 16);
7577       jccb(Assembler::greater, L_copy_16_chars_exit);
7578 
7579     } else if (UseSSE42Intrinsics) {
7580       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7581       movdl(tmp1Reg, tmp5);
7582       pshufd(tmp1Reg, tmp1Reg, 0);
7583       jmpb(L_chars_16_check);
7584     }
7585 
7586     bind(L_copy_16_chars);
7587     if (UseAVX >= 2) {
7588       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7589       vptest(tmp2Reg, tmp1Reg);
7590       jcc(Assembler::notZero, L_copy_16_chars_exit);
7591       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7592       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7593     } else {
7594       if (UseAVX > 0) {
7595         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7596         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7597         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7598       } else {
7599         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7600         por(tmp2Reg, tmp3Reg);
7601         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7602         por(tmp2Reg, tmp4Reg);
7603       }
7604       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7605       jccb(Assembler::notZero, L_copy_16_chars_exit);
7606       packuswb(tmp3Reg, tmp4Reg);
7607     }
7608     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7609 
7610     bind(L_chars_16_check);
7611     addptr(len, 16);
7612     jcc(Assembler::lessEqual, L_copy_16_chars);
7613 
7614     bind(L_copy_16_chars_exit);
7615     if (UseAVX >= 2) {
7616       // clean upper bits of YMM registers
7617       vpxor(tmp2Reg, tmp2Reg);
7618       vpxor(tmp3Reg, tmp3Reg);
7619       vpxor(tmp4Reg, tmp4Reg);
7620       movdl(tmp1Reg, tmp5);
7621       pshufd(tmp1Reg, tmp1Reg, 0);
7622     }
7623     subptr(len, 8);
7624     jccb(Assembler::greater, L_copy_8_chars_exit);
7625 
7626     bind(L_copy_8_chars);
7627     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7628     ptest(tmp3Reg, tmp1Reg);
7629     jccb(Assembler::notZero, L_copy_8_chars_exit);
7630     packuswb(tmp3Reg, tmp1Reg);
7631     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7632     addptr(len, 8);
7633     jccb(Assembler::lessEqual, L_copy_8_chars);
7634 
7635     bind(L_copy_8_chars_exit);
7636     subptr(len, 8);
7637     jccb(Assembler::zero, L_done);
7638   }
7639 
7640   bind(L_copy_1_char);
7641   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7642   testl(tmp5, 0xff00);      // check if Unicode char
7643   jccb(Assembler::notZero, L_copy_1_char_exit);
7644   movb(Address(dst, len, Address::times_1, 0), tmp5);
7645   addptr(len, 1);
7646   jccb(Assembler::less, L_copy_1_char);
7647 
7648   bind(L_copy_1_char_exit);
7649   addptr(result, len); // len is the negative count of unprocessed elements
7650 
7651   bind(L_done);
7652 }
7653 
7654 #ifdef _LP64
7655 /**
7656  * Helper for multiply_to_len(): computes dest_hi:dest_lo += src1 + src2, propagating the carries into dest_hi.
7657  */
7658 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7659   addq(dest_lo, src1);
7660   adcq(dest_hi, 0);
7661   addq(dest_lo, src2);
7662   adcq(dest_hi, 0);
7663 }
7664 
7665 /**
7666  * Multiply 64 bit by 64 bit first loop.
7667  */
7668 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7669                                            Register y, Register y_idx, Register z,
7670                                            Register carry, Register product,
7671                                            Register idx, Register kdx) {
7672   //
7673   //  jlong carry, x[], y[], z[];
7674   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7675   //    huge_128 product = y[idx] * x[xstart] + carry;
7676   //    z[kdx] = (jlong)product;
7677   //    carry  = (jlong)(product >>> 64);
7678   //  }
7679   //  z[xstart] = carry;
7680   //
7681 
7682   Label L_first_loop, L_first_loop_exit;
7683   Label L_one_x, L_one_y, L_multiply;
7684 
7685   decrementl(xstart);
7686   jcc(Assembler::negative, L_one_x);
7687 
7688   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7689   rorq(x_xstart, 32); // convert big-endian to little-endian
7690 
7691   bind(L_first_loop);
7692   decrementl(idx);
7693   jcc(Assembler::negative, L_first_loop_exit);
7694   decrementl(idx);
7695   jcc(Assembler::negative, L_one_y);
7696   movq(y_idx, Address(y, idx, Address::times_4,  0));
7697   rorq(y_idx, 32); // convert big-endian to little-endian
7698   bind(L_multiply);
7699   movq(product, x_xstart);
7700   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7701   addq(product, carry);
7702   adcq(rdx, 0);
7703   subl(kdx, 2);
7704   movl(Address(z, kdx, Address::times_4,  4), product);
7705   shrq(product, 32);
7706   movl(Address(z, kdx, Address::times_4,  0), product);
7707   movq(carry, rdx);
7708   jmp(L_first_loop);
7709 
7710   bind(L_one_y);
7711   movl(y_idx, Address(y,  0));
7712   jmp(L_multiply);
7713 
7714   bind(L_one_x);
7715   movl(x_xstart, Address(x,  0));
7716   jmp(L_first_loop);
7717 
7718   bind(L_first_loop_exit);
7719 }
7720 
7721 /**
7722  * Multiply 64 bit by 64 bit and add 128 bit.
7723  */
7724 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7725                                             Register yz_idx, Register idx,
7726                                             Register carry, Register product, int offset) {
7727   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7728   //     z[kdx] = (jlong)product;
7729 
7730   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
7731   rorq(yz_idx, 32); // convert big-endian to little-endian
7732   movq(product, x_xstart);
7733   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
7734   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
7735   rorq(yz_idx, 32); // convert big-endian to little-endian
7736 
7737   add2_with_carry(rdx, product, carry, yz_idx);
7738 
7739   movl(Address(z, idx, Address::times_4,  offset+4), product);
7740   shrq(product, 32);
7741   movl(Address(z, idx, Address::times_4,  offset), product);
7742 
7743 }
7744 
7745 /**
7746  * Multiply 128 bit by 128 bit. Unrolled inner loop.
7747  */
7748 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7749                                              Register yz_idx, Register idx, Register jdx,
7750                                              Register carry, Register product,
7751                                              Register carry2) {
7752   //   jlong carry, x[], y[], z[];
7753   //   int kdx = ystart+1;
7754   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7755   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7756   //     z[kdx+idx+1] = (jlong)product;
7757   //     jlong carry2  = (jlong)(product >>> 64);
7758   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7759   //     z[kdx+idx] = (jlong)product;
7760   //     carry  = (jlong)(product >>> 64);
7761   //   }
7762   //   idx += 2;
7763   //   if (idx > 0) {
7764   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7765   //     z[kdx+idx] = (jlong)product;
7766   //     carry  = (jlong)(product >>> 64);
7767   //   }
7768   //
7769 
7770   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7771 
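       // jdx = idx / 4: each iteration of the unrolled loop below consumes four
       // 32-bit words (two 64-bit limbs) of y and z.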
7772   movl(jdx, idx);
7773   andl(jdx, 0xFFFFFFFC);
7774   shrl(jdx, 2);
7775 
7776   bind(L_third_loop);
7777   subl(jdx, 1);
7778   jcc(Assembler::negative, L_third_loop_exit);
7779   subl(idx, 4);
7780 
7781   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7782   movq(carry2, rdx);
7783 
7784   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7785   movq(carry, rdx);
7786   jmp(L_third_loop);
7787 
7788   bind (L_third_loop_exit);
7789 
7790   andl (idx, 0x3);
7791   jcc(Assembler::zero, L_post_third_loop_done);
7792 
7793   Label L_check_1;
7794   subl(idx, 2);
7795   jcc(Assembler::negative, L_check_1);
7796 
7797   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7798   movq(carry, rdx);
7799 
7800   bind (L_check_1);
7801   addl (idx, 0x2);
7802   andl (idx, 0x1);
7803   subl(idx, 1);
7804   jcc(Assembler::negative, L_post_third_loop_done);
7805 
7806   movl(yz_idx, Address(y, idx, Address::times_4,  0));
7807   movq(product, x_xstart);
7808   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7809   movl(yz_idx, Address(z, idx, Address::times_4,  0));
7810 
7811   add2_with_carry(rdx, product, yz_idx, carry);
7812 
7813   movl(Address(z, idx, Address::times_4,  0), product);
7814   shrq(product, 32);
7815 
7816   shlq(rdx, 32);
7817   orq(product, rdx);
7818   movq(carry, product);
7819 
7820   bind(L_post_third_loop_done);
7821 }
7822 
7823 /**
7824  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7825  *
7826  */
7827 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7828                                                   Register carry, Register carry2,
7829                                                   Register idx, Register jdx,
7830                                                   Register yz_idx1, Register yz_idx2,
7831                                                   Register tmp, Register tmp3, Register tmp4) {
7832   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7833 
7834   //   jlong carry, x[], y[], z[];
7835   //   int kdx = ystart+1;
7836   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7837   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7838   //     jlong carry2  = (jlong)(tmp3 >>> 64);
7839   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
7840   //     carry  = (jlong)(tmp4 >>> 64);
7841   //     z[kdx+idx+1] = (jlong)tmp3;
7842   //     z[kdx+idx] = (jlong)tmp4;
7843   //   }
7844   //   idx += 2;
7845   //   if (idx > 0) {
7846   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7847   //     z[kdx+idx] = (jlong)yz_idx1;
7848   //     carry  = (jlong)(yz_idx1 >>> 64);
7849   //   }
7850   //
7851 
7852   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7853 
7854   movl(jdx, idx);
7855   andl(jdx, 0xFFFFFFFC);
7856   shrl(jdx, 2);
7857 
7858   bind(L_third_loop);
7859   subl(jdx, 1);
7860   jcc(Assembler::negative, L_third_loop_exit);
7861   subl(idx, 4);
7862 
7863   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
7864   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7865   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
7866   rorxq(yz_idx2, yz_idx2, 32);
7867 
7868   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
7869   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
7870 
7871   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
7872   rorxq(yz_idx1, yz_idx1, 32);
7873   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7874   rorxq(yz_idx2, yz_idx2, 32);
7875 
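       // adcx/adox use two independent carry chains (CF and OF), so the two
       // 64x64 partial products can be accumulated without serializing on a
       // single carry flag.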
7876   if (VM_Version::supports_adx()) {
7877     adcxq(tmp3, carry);
7878     adoxq(tmp3, yz_idx1);
7879 
7880     adcxq(tmp4, tmp);
7881     adoxq(tmp4, yz_idx2);
7882 
7883     movl(carry, 0); // does not affect flags
7884     adcxq(carry2, carry);
7885     adoxq(carry2, carry);
7886   } else {
7887     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7888     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7889   }
7890   movq(carry, carry2);
7891 
7892   movl(Address(z, idx, Address::times_4, 12), tmp3);
7893   shrq(tmp3, 32);
7894   movl(Address(z, idx, Address::times_4,  8), tmp3);
7895 
7896   movl(Address(z, idx, Address::times_4,  4), tmp4);
7897   shrq(tmp4, 32);
7898   movl(Address(z, idx, Address::times_4,  0), tmp4);
7899 
7900   jmp(L_third_loop);
7901 
7902   bind (L_third_loop_exit);
7903 
7904   andl (idx, 0x3);
7905   jcc(Assembler::zero, L_post_third_loop_done);
7906 
7907   Label L_check_1;
7908   subl(idx, 2);
7909   jcc(Assembler::negative, L_check_1);
7910 
7911   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
7912   rorxq(yz_idx1, yz_idx1, 32);
7913   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
7914   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7915   rorxq(yz_idx2, yz_idx2, 32);
7916 
7917   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7918 
7919   movl(Address(z, idx, Address::times_4,  4), tmp3);
7920   shrq(tmp3, 32);
7921   movl(Address(z, idx, Address::times_4,  0), tmp3);
7922   movq(carry, tmp4);
7923 
7924   bind (L_check_1);
7925   addl (idx, 0x2);
7926   andl (idx, 0x1);
7927   subl(idx, 1);
7928   jcc(Assembler::negative, L_post_third_loop_done);
7929   movl(tmp4, Address(y, idx, Address::times_4,  0));
7930   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
7931   movl(tmp4, Address(z, idx, Address::times_4,  0));
7932 
7933   add2_with_carry(carry2, tmp3, tmp4, carry);
7934 
7935   movl(Address(z, idx, Address::times_4,  0), tmp3);
7936   shrq(tmp3, 32);
7937 
7938   shlq(carry2, 32);
7939   orq(tmp3, carry2);
7940   movq(carry, tmp3);
7941 
7942   bind(L_post_third_loop_done);
7943 }
7944 
7945 /**
7946  * Code for BigInteger::multiplyToLen() intrinsic.
7947  *
7948  * rdi: x
7949  * rax: xlen
7950  * rsi: y
7951  * rcx: ylen
7952  * r8:  z
7953  * r11: zlen
7954  * r12: tmp1
7955  * r13: tmp2
7956  * r14: tmp3
7957  * r15: tmp4
7958  * rbx: tmp5
7959  *
7960  */
7961 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7962                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7963   ShortBranchVerifier sbv(this);
7964   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7965 
7966   push(tmp1);
7967   push(tmp2);
7968   push(tmp3);
7969   push(tmp4);
7970   push(tmp5);
7971 
7972   push(xlen);
7973   push(zlen);
7974 
7975   const Register idx = tmp1;
7976   const Register kdx = tmp2;
7977   const Register xstart = tmp3;
7978 
7979   const Register y_idx = tmp4;
7980   const Register carry = tmp5;
7981   const Register product  = xlen;
7982   const Register x_xstart = zlen;  // reuse register
7983 
7984   // First Loop.
7985   //
7986   //  final static long LONG_MASK = 0xffffffffL;
7987   //  int xstart = xlen - 1;
7988   //  int ystart = ylen - 1;
7989   //  long carry = 0;
7990   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7991   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7992   //    z[kdx] = (int)product;
7993   //    carry = product >>> 32;
7994   //  }
7995   //  z[xstart] = (int)carry;
7996   //
7997 
7998   movl(idx, ylen);      // idx = ylen;
7999   movl(kdx, zlen);      // kdx = xlen+ylen;
8000   xorq(carry, carry);   // carry = 0;
8001 
8002   Label L_done;
8003 
8004   movl(xstart, xlen);
8005   decrementl(xstart);
8006   jcc(Assembler::negative, L_done);
8007 
8008   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
8009 
8010   Label L_second_loop;
8011   testl(kdx, kdx);
8012   jcc(Assembler::zero, L_second_loop);
8013 
8014   Label L_carry;
8015   subl(kdx, 1);
8016   jcc(Assembler::zero, L_carry);
8017 
8018   movl(Address(z, kdx, Address::times_4,  0), carry);
8019   shrq(carry, 32);
8020   subl(kdx, 1);
8021 
8022   bind(L_carry);
8023   movl(Address(z, kdx, Address::times_4,  0), carry);
8024 
8025   // Second and third (nested) loops.
8026   //
8027   // for (int i = xstart-1; i >= 0; i--) { // Second loop
8028   //   carry = 0;
8029   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
8030   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
8031   //                    (z[k] & LONG_MASK) + carry;
8032   //     z[k] = (int)product;
8033   //     carry = product >>> 32;
8034   //   }
8035   //   z[i] = (int)carry;
8036   // }
8037   //
8038   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
8039 
8040   const Register jdx = tmp1;
8041 
8042   bind(L_second_loop);
8043   xorl(carry, carry);    // carry = 0;
8044   movl(jdx, ylen);       // j = ystart+1
8045 
8046   subl(xstart, 1);       // i = xstart-1;
8047   jcc(Assembler::negative, L_done);
8048 
8049   push (z);
8050 
8051   Label L_last_x;
8052   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
8053   subl(xstart, 1);       // i = xstart-1;
8054   jcc(Assembler::negative, L_last_x);
8055 
8056   if (UseBMI2Instructions) {
8057     movq(rdx,  Address(x, xstart, Address::times_4,  0));
8058     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
8059   } else {
8060     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8061     rorq(x_xstart, 32);  // convert big-endian to little-endian
8062   }
8063 
8064   Label L_third_loop_prologue;
8065   bind(L_third_loop_prologue);
8066 
8067   push (x);
8068   push (xstart);
8069   push (ylen);
8070 
8071 
8072   if (UseBMI2Instructions) {
8073     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
8074   } else { // !UseBMI2Instructions
8075     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
8076   }
8077 
8078   pop(ylen);
8079   pop(xlen);
8080   pop(x);
8081   pop(z);
8082 
8083   movl(tmp3, xlen);
8084   addl(tmp3, 1);
8085   movl(Address(z, tmp3, Address::times_4,  0), carry);
8086   subl(tmp3, 1);
8087   jccb(Assembler::negative, L_done);
8088 
8089   shrq(carry, 32);
8090   movl(Address(z, tmp3, Address::times_4,  0), carry);
8091   jmp(L_second_loop);
8092 
8093   // Next infrequent code is moved outside loops.
8094   bind(L_last_x);
8095   if (UseBMI2Instructions) {
8096     movl(rdx, Address(x,  0));
8097   } else {
8098     movl(x_xstart, Address(x,  0));
8099   }
8100   jmp(L_third_loop_prologue);
8101 
8102   bind(L_done);
8103 
8104   pop(zlen);
8105   pop(xlen);
8106 
8107   pop(tmp5);
8108   pop(tmp4);
8109   pop(tmp3);
8110   pop(tmp2);
8111   pop(tmp1);
8112 }
8113 
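     /**
      * Find the first element at which obja and objb differ.  length is an element
      * count and log2_array_indxscale (in rcx) is log2 of the element size: length
      * is shifted left by it to get a byte count, and the byte offset of a mismatch
      * is shifted right by it again, so result returns the element index of the
      * first difference, or -1 if the compared ranges are equal.
      */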
8114 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
8115   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
8116   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
8117   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
8118   Label VECTOR8_TAIL, VECTOR4_TAIL;
8119   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
8120   Label SAME_TILL_END, DONE;
8121   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
8122 
8123   //scale is in rcx in both Win64 and Unix
8124   ShortBranchVerifier sbv(this);
8125 
8126   shlq(length);
8127   xorq(result, result);
8128 
8129   if ((UseAVX > 2) &&
8130       VM_Version::supports_avx512vlbw()) {
8131     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
8132 
8133     cmpq(length, 64);
8134     jcc(Assembler::less, VECTOR32_TAIL);
8135     movq(tmp1, length);
8136     andq(tmp1, 0x3F);      // tail count
8137     andq(length, ~(0x3F)); //vector count
8138 
8139     bind(VECTOR64_LOOP);
8140     // AVX512 code to compare 64 byte vectors.
8141     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
8142     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
8143     kortestql(k7, k7);
8144     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
8145     addq(result, 64);
8146     subq(length, 64);
8147     jccb(Assembler::notZero, VECTOR64_LOOP);
8148 
8150     testq(tmp1, tmp1);
8151     jcc(Assembler::zero, SAME_TILL_END);
8152 
8153     //bind(VECTOR64_TAIL);
8154     // AVX512 code to compare the remaining tail of up to 63 bytes.
8155     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
8156     shlxq(tmp2, tmp2, tmp1);
8157     notq(tmp2);
8158     kmovql(k3, tmp2);
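         // tmp2 == (1 << tail_count) - 1, so k3 masks exactly the tail bytes for
         // the final partial 64-byte compare below.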
8159 
8160     evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
8161     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
8162 
8163     ktestql(k7, k3);
8164     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
8165 
8166     bind(VECTOR64_NOT_EQUAL);
8167     kmovql(tmp1, k7);
8168     notq(tmp1);
8169     tzcntq(tmp1, tmp1);
8170     addq(result, tmp1);
8171     shrq(result);
8172     jmp(DONE);
8173     bind(VECTOR32_TAIL);
8174   }
8175 
8176   cmpq(length, 8);
8177   jcc(Assembler::equal, VECTOR8_LOOP);
8178   jcc(Assembler::less, VECTOR4_TAIL);
8179 
8180   if (UseAVX >= 2) {
8181     Label VECTOR16_TAIL, VECTOR32_LOOP;
8182 
8183     cmpq(length, 16);
8184     jcc(Assembler::equal, VECTOR16_LOOP);
8185     jcc(Assembler::less, VECTOR8_LOOP);
8186 
8187     cmpq(length, 32);
8188     jccb(Assembler::less, VECTOR16_TAIL);
8189 
8190     subq(length, 32);
8191     bind(VECTOR32_LOOP);
8192     vmovdqu(rymm0, Address(obja, result));
8193     vmovdqu(rymm1, Address(objb, result));
8194     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8195     vptest(rymm2, rymm2);
8196     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8197     addq(result, 32);
8198     subq(length, 32);
8199     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
8200     addq(length, 32);
8201     jcc(Assembler::equal, SAME_TILL_END);
8202     // falling through if less than 32 bytes are left; close the branch here.
8203 
8204     bind(VECTOR16_TAIL);
8205     cmpq(length, 16);
8206     jccb(Assembler::less, VECTOR8_TAIL);
8207     bind(VECTOR16_LOOP);
8208     movdqu(rymm0, Address(obja, result));
8209     movdqu(rymm1, Address(objb, result));
8210     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8211     ptest(rymm2, rymm2);
8212     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8213     addq(result, 16);
8214     subq(length, 16);
8215     jcc(Assembler::equal, SAME_TILL_END);
8216     //falling through if less than 16 bytes left
8217   } else {//regular intrinsics
8218 
8219     cmpq(length, 16);
8220     jccb(Assembler::less, VECTOR8_TAIL);
8221 
8222     subq(length, 16);
8223     bind(VECTOR16_LOOP);
8224     movdqu(rymm0, Address(obja, result));
8225     movdqu(rymm1, Address(objb, result));
8226     pxor(rymm0, rymm1);
8227     ptest(rymm0, rymm0);
8228     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8229     addq(result, 16);
8230     subq(length, 16);
8231     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8232     addq(length, 16);
8233     jcc(Assembler::equal, SAME_TILL_END);
8234     //falling through if less than 16 bytes left
8235   }
8236 
8237   bind(VECTOR8_TAIL);
8238   cmpq(length, 8);
8239   jccb(Assembler::less, VECTOR4_TAIL);
8240   bind(VECTOR8_LOOP);
8241   movq(tmp1, Address(obja, result));
8242   movq(tmp2, Address(objb, result));
8243   xorq(tmp1, tmp2);
8244   testq(tmp1, tmp1);
8245   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8246   addq(result, 8);
8247   subq(length, 8);
8248   jcc(Assembler::equal, SAME_TILL_END);
8249   //falling through if less than 8 bytes left
8250 
8251   bind(VECTOR4_TAIL);
8252   cmpq(length, 4);
8253   jccb(Assembler::less, BYTES_TAIL);
8254   bind(VECTOR4_LOOP);
8255   movl(tmp1, Address(obja, result));
8256   xorl(tmp1, Address(objb, result));
8257   testl(tmp1, tmp1);
8258   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8259   addq(result, 4);
8260   subq(length, 4);
8261   jcc(Assembler::equal, SAME_TILL_END);
8262   //falling through if less than 4 bytes left
8263 
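       // At most three bytes remain here; the unrolled sequence below checks them
       // one at a time.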
8264   bind(BYTES_TAIL);
8265   bind(BYTES_LOOP);
8266   load_unsigned_byte(tmp1, Address(obja, result));
8267   load_unsigned_byte(tmp2, Address(objb, result));
8268   xorl(tmp1, tmp2);
8269   testl(tmp1, tmp1);
8270   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8271   decq(length);
8272   jcc(Assembler::zero, SAME_TILL_END);
8273   incq(result);
8274   load_unsigned_byte(tmp1, Address(obja, result));
8275   load_unsigned_byte(tmp2, Address(objb, result));
8276   xorl(tmp1, tmp2);
8277   testl(tmp1, tmp1);
8278   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8279   decq(length);
8280   jcc(Assembler::zero, SAME_TILL_END);
8281   incq(result);
8282   load_unsigned_byte(tmp1, Address(obja, result));
8283   load_unsigned_byte(tmp2, Address(objb, result));
8284   xorl(tmp1, tmp2);
8285   testl(tmp1, tmp1);
8286   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8287   jmp(SAME_TILL_END);
8288 
8289   if (UseAVX >= 2) {
8290     bind(VECTOR32_NOT_EQUAL);
8291     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
8292     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
8293     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
8294     vpmovmskb(tmp1, rymm0);
8295     bsfq(tmp1, tmp1);
8296     addq(result, tmp1);
8297     shrq(result);
8298     jmp(DONE);
8299   }
8300 
8301   bind(VECTOR16_NOT_EQUAL);
8302   if (UseAVX >= 2) {
8303     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
8304     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
8305     pxor(rymm0, rymm2);
8306   } else {
8307     pcmpeqb(rymm2, rymm2);
8308     pxor(rymm0, rymm1);
8309     pcmpeqb(rymm0, rymm1);
8310     pxor(rymm0, rymm2);
8311   }
8312   pmovmskb(tmp1, rymm0);
8313   bsfq(tmp1, tmp1);
8314   addq(result, tmp1);
8315   shrq(result);
8316   jmpb(DONE);
8317 
8318   bind(VECTOR8_NOT_EQUAL);
8319   bind(VECTOR4_NOT_EQUAL);
8320   bsfq(tmp1, tmp1);
8321   shrq(tmp1, 3);
8322   addq(result, tmp1);
8323   bind(BYTES_NOT_EQUAL);
8324   shrq(result);
8325   jmpb(DONE);
8326 
8327   bind(SAME_TILL_END);
8328   mov64(result, -1);
8329 
8330   bind(DONE);
8331 }
8332 
8333 //Helper functions for square_to_len()
8334 
8335 /**
8336  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
8337  * Preserves x and z and modifies rest of the registers.
8338  */
8339 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8340   // Perform square and right shift by 1
8341   // Handle odd xlen case first, then for even xlen do the following
8342   // jlong carry = 0;
8343   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
8344   //     huge_128 product = x[j:j+1] * x[j:j+1];
8345   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
8346   //     z[i+2:i+3] = (jlong)(product >>> 1);
8347   //     carry = (jlong)product;
8348   // }
8349 
8350   xorq(tmp5, tmp5);     // carry
8351   xorq(rdxReg, rdxReg);
8352   xorl(tmp1, tmp1);     // index for x
8353   xorl(tmp4, tmp4);     // index for z
8354 
8355   Label L_first_loop, L_first_loop_exit;
8356 
8357   testl(xlen, 1);
8358   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
8359 
8360   // Square and right shift by 1 the odd element using 32 bit multiply
8361   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
8362   imulq(raxReg, raxReg);
8363   shrq(raxReg, 1);
8364   adcq(tmp5, 0);
8365   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
8366   incrementl(tmp1);
8367   addl(tmp4, 2);
8368 
8369   // Square and  right shift by 1 the rest using 64 bit multiply
8370   bind(L_first_loop);
8371   cmpptr(tmp1, xlen);
8372   jccb(Assembler::equal, L_first_loop_exit);
8373 
8374   // Square
8375   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
8376   rorq(raxReg, 32);    // convert big-endian to little-endian
8377   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
8378 
8379   // Right shift by 1 and save carry
8380   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
8381   rcrq(rdxReg, 1);
8382   rcrq(raxReg, 1);
8383   adcq(tmp5, 0);
8384 
8385   // Store result in z
8386   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
8387   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
8388 
8389   // Update indices for x and z
8390   addl(tmp1, 2);
8391   addl(tmp4, 4);
8392   jmp(L_first_loop);
8393 
8394   bind(L_first_loop_exit);
8395 }
8396 
8397 
8398 /**
8399  * Perform the following multiply add operation using BMI2 instructions
8400  * carry:sum = sum + op1*op2 + carry
8401  * op2 should be in rdx
8402  * op2 is preserved, all other registers are modified
8403  */
8404 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
8405   // assert op2 is rdx
8406   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
8407   addq(sum, carry);
8408   adcq(tmp2, 0);
8409   addq(sum, op1);
8410   adcq(tmp2, 0);
8411   movq(carry, tmp2);
8412 }
8413 
8414 /**
8415  * Perform the following multiply add operation:
8416  * carry:sum = sum + op1*op2 + carry
8417  * Preserves op1, op2 and modifies rest of registers
8418  */
8419 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
8420   // rdx:rax = op1 * op2
8421   movq(raxReg, op2);
8422   mulq(op1);
8423 
8424   //  rdx:rax = sum + carry + rdx:rax
8425   addq(sum, carry);
8426   adcq(rdxReg, 0);
8427   addq(sum, raxReg);
8428   adcq(rdxReg, 0);
8429 
8430   // carry:sum = rdx:sum
8431   movq(carry, rdxReg);
8432 }
8433 
8434 /**
8435  * Add 64 bit long carry into z[] with carry propagation.
8436  * Preserves z and carry register values and modifies rest of registers.
8437  *
8438  */
8439 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
8440   Label L_fourth_loop, L_fourth_loop_exit;
8441 
8442   movl(tmp1, 1);
8443   subl(zlen, 2);
8444   addq(Address(z, zlen, Address::times_4, 0), carry);
8445 
8446   bind(L_fourth_loop);
8447   jccb(Assembler::carryClear, L_fourth_loop_exit);
8448   subl(zlen, 2);
8449   jccb(Assembler::negative, L_fourth_loop_exit);
8450   addq(Address(z, zlen, Address::times_4, 0), tmp1);
8451   jmp(L_fourth_loop);
8452   bind(L_fourth_loop_exit);
8453 }
8454 
8455 /**
8456  * Shift z[] left by 1 bit.
8457  * Preserves x, len, z and zlen registers and modifies rest of the registers.
8458  *
8459  */
8460 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
8461 
8462   Label L_fifth_loop, L_fifth_loop_exit;
8463 
8464   // Fifth loop
8465   // Perform primitiveLeftShift(z, zlen, 1)
8466 
8467   const Register prev_carry = tmp1;
8468   const Register new_carry = tmp4;
8469   const Register value = tmp2;
8470   const Register zidx = tmp3;
8471 
8472   // int zidx, carry;
8473   // long value;
8474   // carry = 0;
8475   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
8476   //    (carry:value)  = (z[i] << 1) | carry ;
8477   //    z[i] = value;
8478   // }
8479 
8480   movl(zidx, zlen);
8481   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
8482 
8483   bind(L_fifth_loop);
8484   decl(zidx);  // Use decl to preserve carry flag
8485   decl(zidx);
8486   jccb(Assembler::negative, L_fifth_loop_exit);
8487 
8488   if (UseBMI2Instructions) {
8489      movq(value, Address(z, zidx, Address::times_4, 0));
8490      rclq(value, 1);
8491      rorxq(value, value, 32);
8492      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8493   }
8494   else {
8495     // clear new_carry
8496     xorl(new_carry, new_carry);
8497 
8498     // Shift z[i] by 1, or in previous carry and save new carry
8499     movq(value, Address(z, zidx, Address::times_4, 0));
8500     shlq(value, 1);
8501     adcl(new_carry, 0);
8502 
8503     orq(value, prev_carry);
8504     rorq(value, 0x20);
8505     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8506 
8507     // Set previous carry = new carry
8508     movl(prev_carry, new_carry);
8509   }
8510   jmp(L_fifth_loop);
8511 
8512   bind(L_fifth_loop_exit);
8513 }
8514 
8515 
8516 /**
8517  * Code for BigInteger::squareToLen() intrinsic
8518  *
8519  * rdi: x
8520  * rsi: len
8521  * r8:  z
8522  * rcx: zlen
8523  * r12: tmp1
8524  * r13: tmp2
8525  * r14: tmp3
8526  * r15: tmp4
8527  * rbx: tmp5
8528  *
8529  */
8530 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8531 
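       // Squaring proceeds in phases: the first loop stores the squares of each
       // limb shifted right by one, the second/third/fourth loops add in the
       // off-diagonal products, and the final left shift by one doubles the
       // off-diagonal terms while undoing the halving of the squares; the low bit
       // of the last square is then restored.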
8532   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
8533   push(tmp1);
8534   push(tmp2);
8535   push(tmp3);
8536   push(tmp4);
8537   push(tmp5);
8538 
8539   // First loop
8540   // Store the squares, right shifted one bit (i.e., divided by 2).
8541   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
8542 
8543   // Add in off-diagonal sums.
8544   //
8545   // Second, third (nested) and fourth loops.
8546   // zlen +=2;
8547   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
8548   //    carry = 0;
8549   //    long op2 = x[xidx:xidx+1];
8550   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
8551   //       k -= 2;
8552   //       long op1 = x[j:j+1];
8553   //       long sum = z[k:k+1];
8554   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
8555   //       z[k:k+1] = sum;
8556   //    }
8557   //    add_one_64(z, k, carry, tmp_regs);
8558   // }
8559 
8560   const Register carry = tmp5;
8561   const Register sum = tmp3;
8562   const Register op1 = tmp4;
8563   Register op2 = tmp2;
8564 
8565   push(zlen);
8566   push(len);
8567   addl(zlen,2);
8568   bind(L_second_loop);
8569   xorq(carry, carry);
8570   subl(zlen, 4);
8571   subl(len, 2);
8572   push(zlen);
8573   push(len);
8574   cmpl(len, 0);
8575   jccb(Assembler::lessEqual, L_second_loop_exit);
8576 
8577   // Multiply an array by one 64 bit long.
8578   if (UseBMI2Instructions) {
8579     op2 = rdxReg;
8580     movq(op2, Address(x, len, Address::times_4,  0));
8581     rorxq(op2, op2, 32);
8582   }
8583   else {
8584     movq(op2, Address(x, len, Address::times_4,  0));
8585     rorq(op2, 32);
8586   }
8587 
8588   bind(L_third_loop);
8589   decrementl(len);
8590   jccb(Assembler::negative, L_third_loop_exit);
8591   decrementl(len);
8592   jccb(Assembler::negative, L_last_x);
8593 
8594   movq(op1, Address(x, len, Address::times_4,  0));
8595   rorq(op1, 32);
8596 
8597   bind(L_multiply);
8598   subl(zlen, 2);
8599   movq(sum, Address(z, zlen, Address::times_4,  0));
8600 
8601   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
8602   if (UseBMI2Instructions) {
8603     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8604   }
8605   else {
8606     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8607   }
8608 
8609   movq(Address(z, zlen, Address::times_4, 0), sum);
8610 
8611   jmp(L_third_loop);
8612   bind(L_third_loop_exit);
8613 
8614   // Fourth loop
8615   // Add 64 bit long carry into z with carry propagation.
8616   // Uses the zlen already adjusted by the loop above.
8617   add_one_64(z, zlen, carry, tmp1);
8618 
8619   pop(len);
8620   pop(zlen);
8621   jmp(L_second_loop);
8622 
8623   // Next infrequent code is moved outside loops.
8624   bind(L_last_x);
8625   movl(op1, Address(x, 0));
8626   jmp(L_multiply);
8627 
8628   bind(L_second_loop_exit);
8629   pop(len);
8630   pop(zlen);
8631   pop(len);
8632   pop(zlen);
8633 
8634   // Fifth loop
8635   // Shift z left 1 bit.
8636   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8637 
8638   // z[zlen-1] |= x[len-1] & 1;
8639   movl(tmp3, Address(x, len, Address::times_4, -4));
8640   andl(tmp3, 1);
8641   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
8642 
8643   pop(tmp5);
8644   pop(tmp4);
8645   pop(tmp3);
8646   pop(tmp2);
8647   pop(tmp1);
8648 }
8649 
8650 /**
8651  * Helper function for mul_add()
8652  * Multiply the in[] by int k and add to out[] starting at offset offs using
8653  * 128 bit by 32 bit multiply and return the carry in tmp5.
8654  * Only a whole multiple of four ints of in[] is processed by this function.
8655  * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
8656  * This function preserves the out, in and k registers.
8657  * len and offset point to the appropriate index in "in" and "out" respectively.
8658  * tmp5 holds the carry.
8659  * other registers are temporary and are modified.
8660  *
8661  */
8662 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8663   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8664   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8665 
8666   Label L_first_loop, L_first_loop_exit;
8667 
8668   movl(tmp1, len);
8669   shrl(tmp1, 2);
8670 
8671   bind(L_first_loop);
8672   subl(tmp1, 1);
8673   jccb(Assembler::negative, L_first_loop_exit);
8674 
8675   subl(len, 4);
8676   subl(offset, 4);
8677 
8678   Register op2 = tmp2;
8679   const Register sum = tmp3;
8680   const Register op1 = tmp4;
8681   const Register carry = tmp5;
8682 
8683   if (UseBMI2Instructions) {
8684     op2 = rdxReg;
8685   }
8686 
8687   movq(op1, Address(in, len, Address::times_4,  8));
8688   rorq(op1, 32);
8689   movq(sum, Address(out, offset, Address::times_4,  8));
8690   rorq(sum, 32);
8691   if (UseBMI2Instructions) {
8692     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8693   }
8694   else {
8695     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8696   }
8697   // Store back in big endian from little endian
8698   rorq(sum, 0x20);
8699   movq(Address(out, offset, Address::times_4,  8), sum);
8700 
8701   movq(op1, Address(in, len, Address::times_4,  0));
8702   rorq(op1, 32);
8703   movq(sum, Address(out, offset, Address::times_4,  0));
8704   rorq(sum, 32);
8705   if (UseBMI2Instructions) {
8706     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8707   }
8708   else {
8709     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8710   }
8711   // Store back in big endian from little endian
8712   rorq(sum, 0x20);
8713   movq(Address(out, offset, Address::times_4,  0), sum);
8714 
8715   jmp(L_first_loop);
8716   bind(L_first_loop_exit);
8717 }
8718 
8719 /**
8720  * Code for BigInteger::mulAdd() intrinsic
8721  *
8722  * rdi: out
8723  * rsi: in
8724  * r11: offs (out.length - offset)
8725  * rcx: len
8726  * r8:  k
8727  * r12: tmp1
8728  * r13: tmp2
8729  * r14: tmp3
8730  * r15: tmp4
8731  * rbx: tmp5
8732  * Multiply the in[] by word k and add to out[], return the carry in rax
8733  */
8734 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8735    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8736    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8737 
8738   Label L_carry, L_last_in, L_done;
8739 
8740   // carry = 0;
8741   // for (int j=len-1; j >= 0; j--) {
8742   //    long product = (in[j] & LONG_MASK) * kLong +
8743   //                   (out[offs] & LONG_MASK) + carry;
8744   //    out[offs--] = (int)product;
8745   //    carry = product >>> 32;
8746   // }
8747   //
8748   push(tmp1);
8749   push(tmp2);
8750   push(tmp3);
8751   push(tmp4);
8752   push(tmp5);
8753 
8754   Register op2 = tmp2;
8755   const Register sum = tmp3;
8756   const Register op1 = tmp4;
8757   const Register carry =  tmp5;
8758 
8759   if (UseBMI2Instructions) {
8760     op2 = rdxReg;
8761     movl(op2, k);
8762   }
8763   else {
8764     movl(op2, k);
8765   }
8766 
8767   xorq(carry, carry);
8768 
8769   //First loop
8770 
8771   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
8772   //The carry is in tmp5
8773   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8774 
8775   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
8776   decrementl(len);
8777   jccb(Assembler::negative, L_carry);
8778   decrementl(len);
8779   jccb(Assembler::negative, L_last_in);
8780 
8781   movq(op1, Address(in, len, Address::times_4,  0));
8782   rorq(op1, 32);
8783 
8784   subl(offs, 2);
8785   movq(sum, Address(out, offs, Address::times_4,  0));
8786   rorq(sum, 32);
8787 
8788   if (UseBMI2Instructions) {
8789     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8790   }
8791   else {
8792     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8793   }
8794 
8795   // Store back in big endian from little endian
8796   rorq(sum, 0x20);
8797   movq(Address(out, offs, Address::times_4,  0), sum);
8798 
8799   testl(len, len);
8800   jccb(Assembler::zero, L_carry);
8801 
8802   //Multiply the last in[] entry, if any
8803   bind(L_last_in);
8804   movl(op1, Address(in, 0));
8805   movl(sum, Address(out, offs, Address::times_4,  -4));
8806 
8807   movl(raxReg, k);
8808   mull(op1); //tmp4 * eax -> edx:eax
8809   addl(sum, carry);
8810   adcl(rdxReg, 0);
8811   addl(sum, raxReg);
8812   adcl(rdxReg, 0);
8813   movl(carry, rdxReg);
8814 
8815   movl(Address(out, offs, Address::times_4,  -4), sum);
8816 
8817   bind(L_carry);
8818   //return tmp5/carry as carry in rax
8819   movl(rax, carry);
8820 
8821   bind(L_done);
8822   pop(tmp5);
8823   pop(tmp4);
8824   pop(tmp3);
8825   pop(tmp2);
8826   pop(tmp1);
8827 }
8828 #endif
8829 
8830 /**
8831  * Emits code to update CRC-32 with a byte value according to constants in table
8832  *
8833  * @param [in,out]crc   Register containing the crc.
8834  * @param [in]val       Register containing the byte to fold into the CRC.
8835  * @param [in]table     Register containing the table of crc constants.
8836  *
8837  * uint32_t crc;
8838  * val = crc_table[(val ^ crc) & 0xFF];
8839  * crc = val ^ (crc >> 8);
8840  *
8841  */
8842 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8843   xorl(val, crc);
8844   andl(val, 0xFF);
8845   shrl(crc, 8); // unsigned shift
8846   xorl(crc, Address(table, val, Address::times_4, 0));
8847 }
8848 
8849 /**
8850 * Fold four 128-bit data chunks
8851 */
8852 void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8853   evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [127:64]
8854   evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
8855   evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
8856   evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
8857 }
8858 
8859 /**
8860  * Fold 128-bit data chunk
8861  */
8862 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8863   if (UseAVX > 0) {
8864     vpclmulhdq(xtmp, xK, xcrc); // [127:64]
8865     vpclmulldq(xcrc, xK, xcrc); // [63:0]
8866     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8867     pxor(xcrc, xtmp);
8868   } else {
8869     movdqa(xtmp, xcrc);
8870     pclmulhdq(xtmp, xK);   // [127:64]
8871     pclmulldq(xcrc, xK);   // [63:0]
8872     pxor(xcrc, xtmp);
8873     movdqu(xtmp, Address(buf, offset));
8874     pxor(xcrc, xtmp);
8875   }
8876 }
8877 
8878 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
8879   if (UseAVX > 0) {
8880     vpclmulhdq(xtmp, xK, xcrc);
8881     vpclmulldq(xcrc, xK, xcrc);
8882     pxor(xcrc, xbuf);
8883     pxor(xcrc, xtmp);
8884   } else {
8885     movdqa(xtmp, xcrc);
8886     pclmulhdq(xtmp, xK);
8887     pclmulldq(xcrc, xK);
8888     pxor(xcrc, xbuf);
8889     pxor(xcrc, xtmp);
8890   }
8891 }
8892 
8893 /**
8894  * 8-bit folds to compute 32-bit CRC
8895  *
8896  * uint64_t xcrc;
8897  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8898  */
8899 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8900   movdl(tmp, xcrc);
8901   andl(tmp, 0xFF);
8902   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8903   psrldq(xcrc, 1); // unsigned shift one byte
8904   pxor(xcrc, xtmp);
8905 }
8906 
8907 /**
8908  * uint32_t crc;
8909  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8910  */
8911 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8912   movl(tmp, crc);
8913   andl(tmp, 0xFF);
8914   shrl(crc, 8);
8915   xorl(crc, Address(table, tmp, Address::times_4, 0));
8916 }
8917 
8918 /**
8919  * @param crc   register containing existing CRC (32-bit)
8920  * @param buf   register pointing to input byte buffer (byte*)
8921  * @param len   register containing number of bytes
8922  * @param table register that will contain address of CRC table
8923  * @param tmp   scratch register
8924  */
8925 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8926   assert_different_registers(crc, buf, len, table, tmp, rax);
8927 
8928   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8929   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8930 
  // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the
  // merge context for the registers used, since all instructions below operate
  // in 128-bit mode. On EVEX without VL and BW, these instructions are all AVX.
8934   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8935   notl(crc); // ~crc
8936   cmpl(len, 16);
8937   jcc(Assembler::less, L_tail);
8938 
8939   // Align buffer to 16 bytes
8940   movl(tmp, buf);
8941   andl(tmp, 0xF);
8942   jccb(Assembler::zero, L_aligned);
8943   subl(tmp,  16);
8944   addl(len, tmp);
8945 
8946   align(4);
8947   BIND(L_align_loop);
8948   movsbl(rax, Address(buf, 0)); // load byte with sign extension
8949   update_byte_crc32(crc, rax, table);
8950   increment(buf);
8951   incrementl(tmp);
8952   jccb(Assembler::less, L_align_loop);
8953 
8954   BIND(L_aligned);
8955   movl(tmp, len); // save
8956   shrl(len, 4);
8957   jcc(Assembler::zero, L_tail_restore);
8958 
8959   // Fold total 512 bits of polynomial on each iteration
8960   if (VM_Version::supports_vpclmulqdq()) {
8961     Label Parallel_loop, L_No_Parallel;
8962 
8963     cmpl(len, 8);
8964     jccb(Assembler::less, L_No_Parallel);
8965 
8966     movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8967     evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
8968     movdl(xmm5, crc);
8969     evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
8970     addptr(buf, 64);
8971     subl(len, 7);
    evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); // Propagate the mask from 128 bits to 512 bits
8973 
8974     BIND(Parallel_loop);
8975     fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
8976     addptr(buf, 64);
8977     subl(len, 4);
8978     jcc(Assembler::greater, Parallel_loop);
8979 
8980     vextracti64x2(xmm2, xmm1, 0x01);
8981     vextracti64x2(xmm3, xmm1, 0x02);
8982     vextracti64x2(xmm4, xmm1, 0x03);
8983     jmp(L_fold_512b);
8984 
8985     BIND(L_No_Parallel);
8986   }
8987   // Fold crc into first bytes of vector
8988   movdqa(xmm1, Address(buf, 0));
8989   movdl(rax, xmm1);
8990   xorl(crc, rax);
8991   if (VM_Version::supports_sse4_1()) {
8992     pinsrd(xmm1, crc, 0);
8993   } else {
8994     pinsrw(xmm1, crc, 0);
8995     shrl(crc, 16);
8996     pinsrw(xmm1, crc, 1);
8997   }
8998   addptr(buf, 16);
8999   subl(len, 4); // len > 0
9000   jcc(Assembler::less, L_fold_tail);
9001 
9002   movdqa(xmm2, Address(buf,  0));
9003   movdqa(xmm3, Address(buf, 16));
9004   movdqa(xmm4, Address(buf, 32));
9005   addptr(buf, 48);
9006   subl(len, 3);
9007   jcc(Assembler::lessEqual, L_fold_512b);
9008 
9009   // Fold total 512 bits of polynomial on each iteration,
9010   // 128 bits per each of 4 parallel streams.
9011   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9012 
9013   align(32);
9014   BIND(L_fold_512b_loop);
9015   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9016   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
9017   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
9018   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
9019   addptr(buf, 64);
9020   subl(len, 4);
9021   jcc(Assembler::greater, L_fold_512b_loop);
9022 
9023   // Fold 512 bits to 128 bits.
9024   BIND(L_fold_512b);
9025   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9026   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
9027   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
9028   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
9029 
  // Fold the remaining 128-bit data chunks
9031   BIND(L_fold_tail);
9032   addl(len, 3);
9033   jccb(Assembler::lessEqual, L_fold_128b);
9034   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9035 
9036   BIND(L_fold_tail_loop);
9037   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9038   addptr(buf, 16);
9039   decrementl(len);
9040   jccb(Assembler::greater, L_fold_tail_loop);
9041 
9042   // Fold 128 bits in xmm1 down into 32 bits in crc register.
9043   BIND(L_fold_128b);
9044   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
9045   if (UseAVX > 0) {
9046     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
9047     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
9048     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
9049   } else {
9050     movdqa(xmm2, xmm0);
9051     pclmulqdq(xmm2, xmm1, 0x1);
9052     movdqa(xmm3, xmm0);
9053     pand(xmm3, xmm2);
9054     pclmulqdq(xmm0, xmm3, 0x1);
9055   }
9056   psrldq(xmm1, 8);
9057   psrldq(xmm2, 4);
9058   pxor(xmm0, xmm1);
9059   pxor(xmm0, xmm2);
9060 
9061   // 8 8-bit folds to compute 32-bit CRC.
9062   for (int j = 0; j < 4; j++) {
9063     fold_8bit_crc32(xmm0, table, xmm1, rax);
9064   }
9065   movdl(crc, xmm0); // mov 32 bits to general register
9066   for (int j = 0; j < 4; j++) {
9067     fold_8bit_crc32(crc, table, rax);
9068   }
9069 
9070   BIND(L_tail_restore);
9071   movl(len, tmp); // restore
9072   BIND(L_tail);
9073   andl(len, 0xf);
9074   jccb(Assembler::zero, L_exit);
9075 
  // Fold the remaining bytes
9077   align(4);
9078   BIND(L_tail_loop);
9079   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9080   update_byte_crc32(crc, rax, table);
9081   increment(buf);
9082   decrementl(len);
9083   jccb(Assembler::greater, L_tail_loop);
9084 
9085   BIND(L_exit);
  notl(crc); // ~crc
9087 }
9088 
9089 #ifdef _LP64
9090 // S. Gueron / Information Processing Letters 112 (2012) 184
9091 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32-bit value B = [byte3, byte2, byte1, byte0].
9093 // Output: the 64-bit carry-less product of B * CONST
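// Roughly equivalent C (illustrative sketch; TABLEExt stands for the 64-bit
// entries of the precomputed crc32c table addressed below):
//
//   uint64_t Q1 = TABLEExt[n][ B        & 0xFF];
//   uint64_t Q2 = TABLEExt[n][(B >>  8) & 0xFF];
//   uint64_t Q3 = TABLEExt[n][(B >> 16) & 0xFF];
//   uint64_t Q4 = TABLEExt[n][(B >> 24) & 0xFF];
//   return Q1 ^ (Q2 << 8) ^ (Q3 << 16) ^ (Q4 << 24);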
9094 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
9095                                      Register tmp1, Register tmp2, Register tmp3) {
9096   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9097   if (n > 0) {
9098     addq(tmp3, n * 256 * 8);
9099   }
9100   //    Q1 = TABLEExt[n][B & 0xFF];
9101   movl(tmp1, in);
9102   andl(tmp1, 0x000000FF);
9103   shll(tmp1, 3);
9104   addq(tmp1, tmp3);
9105   movq(tmp1, Address(tmp1, 0));
9106 
9107   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9108   movl(tmp2, in);
9109   shrl(tmp2, 8);
9110   andl(tmp2, 0x000000FF);
9111   shll(tmp2, 3);
9112   addq(tmp2, tmp3);
9113   movq(tmp2, Address(tmp2, 0));
9114 
9115   shlq(tmp2, 8);
9116   xorq(tmp1, tmp2);
9117 
9118   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9119   movl(tmp2, in);
9120   shrl(tmp2, 16);
9121   andl(tmp2, 0x000000FF);
9122   shll(tmp2, 3);
9123   addq(tmp2, tmp3);
9124   movq(tmp2, Address(tmp2, 0));
9125 
9126   shlq(tmp2, 16);
9127   xorq(tmp1, tmp2);
9128 
9129   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9130   shrl(in, 24);
9131   andl(in, 0x000000FF);
9132   shll(in, 3);
9133   addq(in, tmp3);
9134   movq(in, Address(in, 0));
9135 
9136   shlq(in, 24);
9137   xorq(in, tmp1);
9138   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9139 }
9140 
9141 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9142                                       Register in_out,
9143                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9144                                       XMMRegister w_xtmp2,
9145                                       Register tmp1,
9146                                       Register n_tmp2, Register n_tmp3) {
9147   if (is_pclmulqdq_supported) {
9148     movdl(w_xtmp1, in_out); // modified blindly
9149 
9150     movl(tmp1, const_or_pre_comp_const_index);
9151     movdl(w_xtmp2, tmp1);
9152     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9153 
9154     movdq(in_out, w_xtmp1);
9155   } else {
9156     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
9157   }
9158 }
9159 
9160 // Recombination Alternative 2: No bit-reflections
9161 // T1 = (CRC_A * U1) << 1
9162 // T2 = (CRC_B * U2) << 1
9163 // C1 = T1 >> 32
9164 // C2 = T2 >> 32
9165 // T1 = T1 & 0xFFFFFFFF
9166 // T2 = T2 & 0xFFFFFFFF
9167 // T1 = CRC32(0, T1)
9168 // T2 = CRC32(0, T2)
9169 // C1 = C1 ^ T1
9170 // C2 = C2 ^ T2
9171 // CRC = C1 ^ C2 ^ CRC_C
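//
// In C-like terms (illustrative sketch; clmul32 is a hypothetical 32x32->64
// carry-less multiply and _mm_crc32_u32 is the SSE4.2 intrinsic for the CRC32
// instruction used below):
//
//   uint64_t t1 = clmul32(crc_a, U1) << 1;
//   uint64_t t2 = clmul32(crc_b, U2) << 1;
//   uint32_t c1 = (uint32_t)(t1 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t1);
//   uint32_t c2 = (uint32_t)(t2 >> 32) ^ _mm_crc32_u32(0, (uint32_t)t2);
//   crc = c1 ^ c2 ^ crc_c;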
9172 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9173                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9174                                      Register tmp1, Register tmp2,
9175                                      Register n_tmp3) {
9176   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9177   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9178   shlq(in_out, 1);
9179   movl(tmp1, in_out);
9180   shrq(in_out, 32);
9181   xorl(tmp2, tmp2);
9182   crc32(tmp2, tmp1, 4);
9183   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
9184   shlq(in1, 1);
9185   movl(tmp1, in1);
9186   shrq(in1, 32);
9187   xorl(tmp2, tmp2);
9188   crc32(tmp2, tmp1, 4);
9189   xorl(in1, tmp2);
9190   xorl(in_out, in1);
9191   xorl(in_out, in2);
9192 }
9193 
// Set N to a predefined value
// Subtract it from the length of the buffer
// Execute in a loop:
9197 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9198 // for i = 1 to N do
9199 //  CRC_A = CRC32(CRC_A, A[i])
9200 //  CRC_B = CRC32(CRC_B, B[i])
9201 //  CRC_C = CRC32(CRC_C, C[i])
9202 // end for
9203 // Recombine
9204 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9205                                        Register in_out1, Register in_out2, Register in_out3,
9206                                        Register tmp1, Register tmp2, Register tmp3,
9207                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9208                                        Register tmp4, Register tmp5,
9209                                        Register n_tmp6) {
9210   Label L_processPartitions;
9211   Label L_processPartition;
9212   Label L_exit;
9213 
9214   bind(L_processPartitions);
9215   cmpl(in_out1, 3 * size);
9216   jcc(Assembler::less, L_exit);
9217     xorl(tmp1, tmp1);
9218     xorl(tmp2, tmp2);
9219     movq(tmp3, in_out2);
9220     addq(tmp3, size);
9221 
9222     bind(L_processPartition);
9223       crc32(in_out3, Address(in_out2, 0), 8);
9224       crc32(tmp1, Address(in_out2, size), 8);
9225       crc32(tmp2, Address(in_out2, size * 2), 8);
9226       addq(in_out2, 8);
9227       cmpq(in_out2, tmp3);
9228       jcc(Assembler::less, L_processPartition);
9229     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9230             w_xtmp1, w_xtmp2, w_xtmp3,
9231             tmp4, tmp5,
9232             n_tmp6);
9233     addq(in_out2, 2 * size);
9234     subl(in_out1, 3 * size);
9235     jmp(L_processPartitions);
9236 
9237   bind(L_exit);
9238 }
9239 #else
9240 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9241                                      Register tmp1, Register tmp2, Register tmp3,
9242                                      XMMRegister xtmp1, XMMRegister xtmp2) {
9243   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9244   if (n > 0) {
9245     addl(tmp3, n * 256 * 8);
9246   }
9247   //    Q1 = TABLEExt[n][B & 0xFF];
9248   movl(tmp1, in_out);
9249   andl(tmp1, 0x000000FF);
9250   shll(tmp1, 3);
9251   addl(tmp1, tmp3);
9252   movq(xtmp1, Address(tmp1, 0));
9253 
9254   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9255   movl(tmp2, in_out);
9256   shrl(tmp2, 8);
9257   andl(tmp2, 0x000000FF);
9258   shll(tmp2, 3);
9259   addl(tmp2, tmp3);
9260   movq(xtmp2, Address(tmp2, 0));
9261 
9262   psllq(xtmp2, 8);
9263   pxor(xtmp1, xtmp2);
9264 
9265   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9266   movl(tmp2, in_out);
9267   shrl(tmp2, 16);
9268   andl(tmp2, 0x000000FF);
9269   shll(tmp2, 3);
9270   addl(tmp2, tmp3);
9271   movq(xtmp2, Address(tmp2, 0));
9272 
9273   psllq(xtmp2, 16);
9274   pxor(xtmp1, xtmp2);
9275 
9276   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9277   shrl(in_out, 24);
9278   andl(in_out, 0x000000FF);
9279   shll(in_out, 3);
9280   addl(in_out, tmp3);
9281   movq(xtmp2, Address(in_out, 0));
9282 
9283   psllq(xtmp2, 24);
  pxor(xtmp1, xtmp2); // Result left in xtmp1
9285   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9286 }
9287 
9288 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9289                                       Register in_out,
9290                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9291                                       XMMRegister w_xtmp2,
9292                                       Register tmp1,
9293                                       Register n_tmp2, Register n_tmp3) {
9294   if (is_pclmulqdq_supported) {
9295     movdl(w_xtmp1, in_out);
9296 
9297     movl(tmp1, const_or_pre_comp_const_index);
9298     movdl(w_xtmp2, tmp1);
9299     pclmulqdq(w_xtmp1, w_xtmp2, 0);
    // Keep the result in XMM since a 32-bit GPR cannot hold the 64-bit product
9301   } else {
9302     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
9303   }
9304 }
9305 
9306 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9307                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9308                                      Register tmp1, Register tmp2,
9309                                      Register n_tmp3) {
9310   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9311   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9312 
9313   psllq(w_xtmp1, 1);
9314   movdl(tmp1, w_xtmp1);
9315   psrlq(w_xtmp1, 32);
9316   movdl(in_out, w_xtmp1);
9317 
9318   xorl(tmp2, tmp2);
9319   crc32(tmp2, tmp1, 4);
9320   xorl(in_out, tmp2);
9321 
9322   psllq(w_xtmp2, 1);
9323   movdl(tmp1, w_xtmp2);
9324   psrlq(w_xtmp2, 32);
9325   movdl(in1, w_xtmp2);
9326 
9327   xorl(tmp2, tmp2);
9328   crc32(tmp2, tmp1, 4);
9329   xorl(in1, tmp2);
9330   xorl(in_out, in1);
9331   xorl(in_out, in2);
9332 }
9333 
9334 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9335                                        Register in_out1, Register in_out2, Register in_out3,
9336                                        Register tmp1, Register tmp2, Register tmp3,
9337                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9338                                        Register tmp4, Register tmp5,
9339                                        Register n_tmp6) {
9340   Label L_processPartitions;
9341   Label L_processPartition;
9342   Label L_exit;
9343 
9344   bind(L_processPartitions);
9345   cmpl(in_out1, 3 * size);
9346   jcc(Assembler::less, L_exit);
9347     xorl(tmp1, tmp1);
9348     xorl(tmp2, tmp2);
9349     movl(tmp3, in_out2);
9350     addl(tmp3, size);
9351 
9352     bind(L_processPartition);
9353       crc32(in_out3, Address(in_out2, 0), 4);
9354       crc32(tmp1, Address(in_out2, size), 4);
9355       crc32(tmp2, Address(in_out2, size*2), 4);
9356       crc32(in_out3, Address(in_out2, 0+4), 4);
9357       crc32(tmp1, Address(in_out2, size+4), 4);
9358       crc32(tmp2, Address(in_out2, size*2+4), 4);
9359       addl(in_out2, 8);
9360       cmpl(in_out2, tmp3);
9361       jcc(Assembler::less, L_processPartition);
9362 
9363         push(tmp3);
9364         push(in_out1);
9365         push(in_out2);
9366         tmp4 = tmp3;
9367         tmp5 = in_out1;
9368         n_tmp6 = in_out2;
9369 
9370       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9371             w_xtmp1, w_xtmp2, w_xtmp3,
9372             tmp4, tmp5,
9373             n_tmp6);
9374 
9375         pop(in_out2);
9376         pop(in_out1);
9377         pop(tmp3);
9378 
9379     addl(in_out2, 2 * size);
9380     subl(in_out1, 3 * size);
9381     jmp(L_processPartitions);
9382 
9383   bind(L_exit);
9384 }
9385 #endif //LP64
9386 
9387 #ifdef _LP64
9388 // Algorithm 2: Pipelined usage of the CRC32 instruction.
9389 // Input: A buffer I of L bytes.
9390 // Output: the CRC32C value of the buffer.
9391 // Notations:
9392 // Write L = 24N + r, with N = floor (L/24).
9393 // r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation A|B|C|R, where A, B and C each consist of
// N quadwords, and R consists of r bytes.
9396 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
9397 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
9398 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
9399 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
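//
// A rough C outline of the driver (illustrative sketch only; crc32c_u64 and
// crc32c_u8 stand for the 64-bit and byte forms of the CRC32 instruction, and
// chunking over CRC32C_HIGH/MIDDLE/LOW is omitted):
//
//   uint32_t a = crc, b = 0, c = 0;
//   for (size_t j = 0; j < N; j++) {
//     a = crc32c_u64(a, A[j]);
//     b = crc32c_u64(b, B[j]);
//     c = crc32c_u64(c, C[j]);
//   }
//   crc = recombine(a, b, c);           // crc32c_rec_alt2 above
//   for (size_t j = 0; j < r; j++)
//     crc = crc32c_u8(crc, R[j]);       // tail, handled word- then byte-wise below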
9400 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9401                                           Register tmp1, Register tmp2, Register tmp3,
9402                                           Register tmp4, Register tmp5, Register tmp6,
9403                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9404                                           bool is_pclmulqdq_supported) {
9405   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9406   Label L_wordByWord;
9407   Label L_byteByByteProlog;
9408   Label L_byteByByte;
9409   Label L_exit;
9410 
  if (is_pclmulqdq_supported) {
9412     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9413     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
9414 
9415     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9416     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9417 
9418     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9419     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9420     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
9421   } else {
9422     const_or_pre_comp_const_index[0] = 1;
9423     const_or_pre_comp_const_index[1] = 0;
9424 
9425     const_or_pre_comp_const_index[2] = 3;
9426     const_or_pre_comp_const_index[3] = 2;
9427 
9428     const_or_pre_comp_const_index[4] = 5;
9429     const_or_pre_comp_const_index[5] = 4;
9430    }
9431   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9432                     in2, in1, in_out,
9433                     tmp1, tmp2, tmp3,
9434                     w_xtmp1, w_xtmp2, w_xtmp3,
9435                     tmp4, tmp5,
9436                     tmp6);
9437   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9438                     in2, in1, in_out,
9439                     tmp1, tmp2, tmp3,
9440                     w_xtmp1, w_xtmp2, w_xtmp3,
9441                     tmp4, tmp5,
9442                     tmp6);
9443   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9444                     in2, in1, in_out,
9445                     tmp1, tmp2, tmp3,
9446                     w_xtmp1, w_xtmp2, w_xtmp3,
9447                     tmp4, tmp5,
9448                     tmp6);
9449   movl(tmp1, in2);
9450   andl(tmp1, 0x00000007);
9451   negl(tmp1);
9452   addl(tmp1, in2);
9453   addq(tmp1, in1);
9454 
9455   BIND(L_wordByWord);
9456   cmpq(in1, tmp1);
9457   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9458     crc32(in_out, Address(in1, 0), 4);
9459     addq(in1, 4);
9460     jmp(L_wordByWord);
9461 
9462   BIND(L_byteByByteProlog);
9463   andl(in2, 0x00000007);
9464   movl(tmp2, 1);
9465 
9466   BIND(L_byteByByte);
9467   cmpl(tmp2, in2);
9468   jccb(Assembler::greater, L_exit);
9469     crc32(in_out, Address(in1, 0), 1);
9470     incq(in1);
9471     incl(tmp2);
9472     jmp(L_byteByByte);
9473 
9474   BIND(L_exit);
9475 }
9476 #else
9477 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9478                                           Register tmp1, Register  tmp2, Register tmp3,
9479                                           Register tmp4, Register  tmp5, Register tmp6,
9480                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9481                                           bool is_pclmulqdq_supported) {
9482   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9483   Label L_wordByWord;
9484   Label L_byteByByteProlog;
9485   Label L_byteByByte;
9486   Label L_exit;
9487 
9488   if (is_pclmulqdq_supported) {
9489     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9490     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
9491 
9492     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9493     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9494 
9495     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9496     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9497   } else {
9498     const_or_pre_comp_const_index[0] = 1;
9499     const_or_pre_comp_const_index[1] = 0;
9500 
9501     const_or_pre_comp_const_index[2] = 3;
9502     const_or_pre_comp_const_index[3] = 2;
9503 
9504     const_or_pre_comp_const_index[4] = 5;
9505     const_or_pre_comp_const_index[5] = 4;
9506   }
9507   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9508                     in2, in1, in_out,
9509                     tmp1, tmp2, tmp3,
9510                     w_xtmp1, w_xtmp2, w_xtmp3,
9511                     tmp4, tmp5,
9512                     tmp6);
9513   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9514                     in2, in1, in_out,
9515                     tmp1, tmp2, tmp3,
9516                     w_xtmp1, w_xtmp2, w_xtmp3,
9517                     tmp4, tmp5,
9518                     tmp6);
9519   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9520                     in2, in1, in_out,
9521                     tmp1, tmp2, tmp3,
9522                     w_xtmp1, w_xtmp2, w_xtmp3,
9523                     tmp4, tmp5,
9524                     tmp6);
9525   movl(tmp1, in2);
9526   andl(tmp1, 0x00000007);
9527   negl(tmp1);
9528   addl(tmp1, in2);
9529   addl(tmp1, in1);
9530 
9531   BIND(L_wordByWord);
9532   cmpl(in1, tmp1);
9533   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9534     crc32(in_out, Address(in1,0), 4);
9535     addl(in1, 4);
9536     jmp(L_wordByWord);
9537 
9538   BIND(L_byteByByteProlog);
9539   andl(in2, 0x00000007);
9540   movl(tmp2, 1);
9541 
9542   BIND(L_byteByByte);
9543   cmpl(tmp2, in2);
9544   jccb(Assembler::greater, L_exit);
9545     movb(tmp1, Address(in1, 0));
9546     crc32(in_out, tmp1, 1);
9547     incl(in1);
9548     incl(tmp2);
9549     jmp(L_byteByByte);
9550 
9551   BIND(L_exit);
9552 }
9553 #endif // LP64
9554 #undef BIND
9555 #undef BLOCK_COMMENT
9556 
9557 // Compress char[] array to byte[].
9558 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
9559 //   @HotSpotIntrinsicCandidate
9560 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
9561 //     for (int i = 0; i < len; i++) {
9562 //       int c = src[srcOff++];
9563 //       if (c >>> 8 != 0) {
9564 //         return 0;
9565 //       }
9566 //       dst[dstOff++] = (byte)c;
9567 //     }
9568 //     return len;
9569 //   }
9570 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
9571   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
9572   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
9573   Register tmp5, Register result) {
9574   Label copy_chars_loop, return_length, return_zero, done;
9575 
9576   // rsi: src
9577   // rdi: dst
9578   // rdx: len
9579   // rcx: tmp5
9580   // rax: result
9581 
9582   // rsi holds start addr of source char[] to be compressed
9583   // rdi holds start addr of destination byte[]
9584   // rdx holds length
9585 
9586   assert(len != result, "");
9587 
9588   // save length for return
9589   push(len);
9590 
9591   if ((UseAVX > 2) && // AVX512
9592     VM_Version::supports_avx512vlbw() &&
9593     VM_Version::supports_bmi2()) {
9594 
9595     Label copy_32_loop, copy_loop_tail, below_threshold;
9596 
9597     // alignment
9598     Label post_alignment;
9599 
    // If the length of the string is less than 32, handle it the old-fashioned way
9601     testl(len, -32);
9602     jcc(Assembler::zero, below_threshold);
9603 
    // First check whether a character is compressible (<= 0xFF).
    // Create a mask to test for Unicode chars inside the zmm vector.
9606     movl(result, 0x00FF);
9607     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
9608 
9609     testl(len, -64);
9610     jcc(Assembler::zero, post_alignment);
9611 
9612     movl(tmp5, dst);
9613     andl(tmp5, (32 - 1));
9614     negl(tmp5);
9615     andl(tmp5, (32 - 1));
9616 
9617     // bail out when there is nothing to be done
9618     testl(tmp5, 0xFFFFFFFF);
9619     jcc(Assembler::zero, post_alignment);
9620 
9621     // ~(~0 << len), where len is the # of remaining elements to process
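    // e.g. a count of 5 yields the mask 0x0000001F (low 5 lanes enabled)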
9622     movl(result, 0xFFFFFFFF);
9623     shlxl(result, result, tmp5);
9624     notl(result);
9625     kmovdl(k3, result);
9626 
9627     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9628     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9629     ktestd(k2, k3);
9630     jcc(Assembler::carryClear, return_zero);
9631 
9632     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9633 
9634     addptr(src, tmp5);
9635     addptr(src, tmp5);
9636     addptr(dst, tmp5);
9637     subl(len, tmp5);
9638 
9639     bind(post_alignment);
9640     // end of alignment
9641 
9642     movl(tmp5, len);
9643     andl(tmp5, (32 - 1));    // tail count (in chars)
9644     andl(len, ~(32 - 1));    // vector count (in chars)
9645     jcc(Assembler::zero, copy_loop_tail);
9646 
9647     lea(src, Address(src, len, Address::times_2));
9648     lea(dst, Address(dst, len, Address::times_1));
9649     negptr(len);
9650 
9651     bind(copy_32_loop);
9652     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9653     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9654     kortestdl(k2, k2);
9655     jcc(Assembler::carryClear, return_zero);
9656 
    // All elements in the current chunk are valid candidates for
    // compression. Write the truncated byte elements to memory.
9659     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9660     addptr(len, 32);
9661     jcc(Assembler::notZero, copy_32_loop);
9662 
9663     bind(copy_loop_tail);
9664     // bail out when there is nothing to be done
9665     testl(tmp5, 0xFFFFFFFF);
9666     jcc(Assembler::zero, return_length);
9667 
9668     movl(len, tmp5);
9669 
9670     // ~(~0 << len), where len is the # of remaining elements to process
9671     movl(result, 0xFFFFFFFF);
9672     shlxl(result, result, len);
9673     notl(result);
9674 
9675     kmovdl(k3, result);
9676 
9677     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9678     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9679     ktestd(k2, k3);
9680     jcc(Assembler::carryClear, return_zero);
9681 
9682     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9683     jmp(return_length);
9684 
9685     bind(below_threshold);
9686   }
9687 
9688   if (UseSSE42Intrinsics) {
9689     Label copy_32_loop, copy_16, copy_tail;
9690 
9691     movl(result, len);
9692 
9693     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
9694 
9695     // vectored compression
9696     andl(len, 0xfffffff0);    // vector count (in chars)
9697     andl(result, 0x0000000f);    // tail count (in chars)
9698     testl(len, len);
9699     jcc(Assembler::zero, copy_16);
9700 
9701     // compress 16 chars per iter
9702     movdl(tmp1Reg, tmp5);
9703     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9704     pxor(tmp4Reg, tmp4Reg);
9705 
9706     lea(src, Address(src, len, Address::times_2));
9707     lea(dst, Address(dst, len, Address::times_1));
9708     negptr(len);
9709 
9710     bind(copy_32_loop);
9711     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
9712     por(tmp4Reg, tmp2Reg);
9713     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9714     por(tmp4Reg, tmp3Reg);
9715     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
9716     jcc(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9718     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9719     addptr(len, 16);
9720     jcc(Assembler::notZero, copy_32_loop);
9721 
9722     // compress next vector of 8 chars (if any)
9723     bind(copy_16);
9724     movl(len, result);
9725     andl(len, 0xfffffff8);    // vector count (in chars)
9726     andl(result, 0x00000007);    // tail count (in chars)
9727     testl(len, len);
9728     jccb(Assembler::zero, copy_tail);
9729 
9730     movdl(tmp1Reg, tmp5);
9731     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9732     pxor(tmp3Reg, tmp3Reg);
9733 
9734     movdqu(tmp2Reg, Address(src, 0));
9735     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
9736     jccb(Assembler::notZero, return_zero);
9737     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9738     movq(Address(dst, 0), tmp2Reg);
9739     addptr(src, 16);
9740     addptr(dst, 8);
9741 
9742     bind(copy_tail);
9743     movl(len, result);
9744   }
9745   // compress 1 char per iter
9746   testl(len, len);
9747   jccb(Assembler::zero, return_length);
9748   lea(src, Address(src, len, Address::times_2));
9749   lea(dst, Address(dst, len, Address::times_1));
9750   negptr(len);
9751 
9752   bind(copy_chars_loop);
9753   load_unsigned_short(result, Address(src, len, Address::times_2));
9754   testl(result, 0xff00);      // check if Unicode char
9755   jccb(Assembler::notZero, return_zero);
  movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
9757   increment(len);
9758   jcc(Assembler::notZero, copy_chars_loop);
9759 
9760   // if compression succeeded, return length
9761   bind(return_length);
9762   pop(result);
9763   jmpb(done);
9764 
9765   // if compression failed, return 0
9766   bind(return_zero);
9767   xorl(result, result);
9768   addptr(rsp, wordSize);
9769 
9770   bind(done);
9771 }
9772 
9773 // Inflate byte[] array to char[].
9774 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9775 //   @HotSpotIntrinsicCandidate
9776 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9777 //     for (int i = 0; i < len; i++) {
9778 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
9779 //     }
9780 //   }
9781 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9782   XMMRegister tmp1, Register tmp2) {
9783   Label copy_chars_loop, done, below_threshold;
9784   // rsi: src
9785   // rdi: dst
9786   // rdx: len
9787   // rcx: tmp2
9788 
9789   // rsi holds start addr of source byte[] to be inflated
9790   // rdi holds start addr of destination char[]
9791   // rdx holds length
9792   assert_different_registers(src, dst, len, tmp2);
9793 
9794   if ((UseAVX > 2) && // AVX512
9795     VM_Version::supports_avx512vlbw() &&
9796     VM_Version::supports_bmi2()) {
9797 
9798     Label copy_32_loop, copy_tail;
9799     Register tmp3_aliased = len;
9800 
    // If the length of the string is less than 16, handle it the old-fashioned way
9802     testl(len, -16);
9803     jcc(Assembler::zero, below_threshold);
9804 
    // In order to use only one arithmetic operation per iteration in the main
    // loop, we do this pre-calculation up front.
9807     movl(tmp2, len);
9808     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9809     andl(len, -32);     // vector count
9810     jccb(Assembler::zero, copy_tail);
9811 
9812     lea(src, Address(src, len, Address::times_1));
9813     lea(dst, Address(dst, len, Address::times_2));
9814     negptr(len);
9815 
9816 
9817     // inflate 32 chars per iter
9818     bind(copy_32_loop);
9819     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9820     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9821     addptr(len, 32);
9822     jcc(Assembler::notZero, copy_32_loop);
9823 
9824     bind(copy_tail);
9825     // bail out when there is nothing to be done
9826     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9827     jcc(Assembler::zero, done);
9828 
9829     // ~(~0 << length), where length is the # of remaining elements to process
9830     movl(tmp3_aliased, -1);
9831     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9832     notl(tmp3_aliased);
9833     kmovdl(k2, tmp3_aliased);
9834     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
9835     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
9836 
9837     jmp(done);
9838   }
9839   if (UseSSE42Intrinsics) {
9840     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9841 
9842     movl(tmp2, len);
9843 
9844     if (UseAVX > 1) {
9845       andl(tmp2, (16 - 1));
9846       andl(len, -16);
9847       jccb(Assembler::zero, copy_new_tail);
9848     } else {
9849       andl(tmp2, 0x00000007);   // tail count (in chars)
9850       andl(len, 0xfffffff8);    // vector count (in chars)
9851       jccb(Assembler::zero, copy_tail);
9852     }
9853 
9854     // vectored inflation
9855     lea(src, Address(src, len, Address::times_1));
9856     lea(dst, Address(dst, len, Address::times_2));
9857     negptr(len);
9858 
9859     if (UseAVX > 1) {
9860       bind(copy_16_loop);
9861       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9862       vmovdqu(Address(dst, len, Address::times_2), tmp1);
9863       addptr(len, 16);
9864       jcc(Assembler::notZero, copy_16_loop);
9865 
9866       bind(below_threshold);
9867       bind(copy_new_tail);
9868       if ((UseAVX > 2) &&
9869         VM_Version::supports_avx512vlbw() &&
9870         VM_Version::supports_bmi2()) {
9871         movl(tmp2, len);
9872       } else {
9873         movl(len, tmp2);
9874       }
9875       andl(tmp2, 0x00000007);
9876       andl(len, 0xFFFFFFF8);
9877       jccb(Assembler::zero, copy_tail);
9878 
9879       pmovzxbw(tmp1, Address(src, 0));
9880       movdqu(Address(dst, 0), tmp1);
9881       addptr(src, 8);
9882       addptr(dst, 2 * 8);
9883 
9884       jmp(copy_tail, true);
9885     }
9886 
9887     // inflate 8 chars per iter
9888     bind(copy_8_loop);
9889     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
9890     movdqu(Address(dst, len, Address::times_2), tmp1);
9891     addptr(len, 8);
9892     jcc(Assembler::notZero, copy_8_loop);
9893 
9894     bind(copy_tail);
9895     movl(len, tmp2);
9896 
9897     cmpl(len, 4);
9898     jccb(Assembler::less, copy_bytes);
9899 
9900     movdl(tmp1, Address(src, 0));  // load 4 byte chars
9901     pmovzxbw(tmp1, tmp1);
9902     movq(Address(dst, 0), tmp1);
9903     subptr(len, 4);
9904     addptr(src, 4);
9905     addptr(dst, 8);
9906 
9907     bind(copy_bytes);
9908   } else {
9909     bind(below_threshold);
9910   }
9911 
9912   testl(len, len);
9913   jccb(Assembler::zero, done);
9914   lea(src, Address(src, len, Address::times_1));
9915   lea(dst, Address(dst, len, Address::times_2));
9916   negptr(len);
9917 
9918   // inflate 1 char per iter
9919   bind(copy_chars_loop);
9920   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
9921   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
9922   increment(len);
9923   jcc(Assembler::notZero, copy_chars_loop);
9924 
9925   bind(done);
9926 }
9927 
9928 #ifdef _LP64
9929 void MacroAssembler::cache_wb(Address line)
9930 {
  // 64-bit CPUs always support clflush
9932   assert(VM_Version::supports_clflush(), "clflush should be available");
9933   bool optimized = VM_Version::supports_clflushopt();
9934   bool no_evict = VM_Version::supports_clwb();
9935 
  // Prefer clwb (writeback without evict); otherwise
  // prefer clflushopt (potentially parallel writeback with evict);
  // otherwise fall back on clflush (serial writeback with evict).
9939 
9940   if (optimized) {
9941     if (no_evict) {
9942       clwb(line);
9943     } else {
9944       clflushopt(line);
9945     }
9946   } else {
9947     // no need for fence when using CLFLUSH
9948     clflush(line);
9949   }
9950 }
9951 
9952 void MacroAssembler::cache_wbsync(bool is_pre)
9953 {
9954   assert(VM_Version::supports_clflush(), "clflush should be available");
9955   bool optimized = VM_Version::supports_clflushopt();
9956   bool no_evict = VM_Version::supports_clwb();
9957 
9958   // pick the correct implementation
9959 
9960   if (!is_pre && (optimized || no_evict)) {
    // An sfence is needed after the flush when using clflushopt or clwb;
    // otherwise no synchronization is needed.
9963 
9964     sfence();
9965   }
9966 }
9967 #endif // _LP64
9968 
9969 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9970   switch (cond) {
9971     // Note some conditions are synonyms for others
9972     case Assembler::zero:         return Assembler::notZero;
9973     case Assembler::notZero:      return Assembler::zero;
9974     case Assembler::less:         return Assembler::greaterEqual;
9975     case Assembler::lessEqual:    return Assembler::greater;
9976     case Assembler::greater:      return Assembler::lessEqual;
9977     case Assembler::greaterEqual: return Assembler::less;
9978     case Assembler::below:        return Assembler::aboveEqual;
9979     case Assembler::belowEqual:   return Assembler::above;
9980     case Assembler::above:        return Assembler::belowEqual;
9981     case Assembler::aboveEqual:   return Assembler::below;
9982     case Assembler::overflow:     return Assembler::noOverflow;
9983     case Assembler::noOverflow:   return Assembler::overflow;
9984     case Assembler::negative:     return Assembler::positive;
9985     case Assembler::positive:     return Assembler::negative;
9986     case Assembler::parity:       return Assembler::noParity;
9987     case Assembler::noParity:     return Assembler::parity;
9988   }
9989   ShouldNotReachHere(); return Assembler::overflow;
9990 }
9991 
9992 SkipIfEqual::SkipIfEqual(
9993     MacroAssembler* masm, const bool* flag_addr, bool value) {
9994   _masm = masm;
9995   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9996   _masm->jcc(Assembler::equal, _label);
9997 }
9998 
9999 SkipIfEqual::~SkipIfEqual() {
10000   _masm->bind(_label);
10001 }
10002 
10003 // 32-bit Windows has its own fast-path implementation
10004 // of get_thread
10005 #if !defined(WIN32) || defined(_LP64)
10006 
10007 // This is simply a call to Thread::current()
10008 void MacroAssembler::get_thread(Register thread) {
10009   if (thread != rax) {
10010     push(rax);
10011   }
10012   LP64_ONLY(push(rdi);)
10013   LP64_ONLY(push(rsi);)
10014   push(rdx);
10015   push(rcx);
10016 #ifdef _LP64
10017   push(r8);
10018   push(r9);
10019   push(r10);
10020   push(r11);
10021 #endif
10022 
10023   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10024 
10025 #ifdef _LP64
10026   pop(r11);
10027   pop(r10);
10028   pop(r9);
10029   pop(r8);
10030 #endif
10031   pop(rcx);
10032   pop(rdx);
10033   LP64_ONLY(pop(rsi);)
10034   LP64_ONLY(pop(rdi);)
10035   if (thread != rax) {
10036     mov(thread, rax);
10037     pop(rax);
10038   }
10039 }
10040 
10041 #endif