1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/collectedHeap.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "memory/universe.hpp"
  36 #include "oops/accessDecorators.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/biasedLocking.hpp"
  41 #include "runtime/flags/flagSetting.hpp"
  42 #include "runtime/interfaceSupport.inline.hpp"
  43 #include "runtime/objectMonitor.hpp"
  44 #include "runtime/os.hpp"
  45 #include "runtime/safepoint.hpp"
  46 #include "runtime/safepointMechanism.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.hpp"
  50 #include "utilities/macros.hpp"
  51 #include "crc32c.h"
  52 #ifdef COMPILER2
  53 #include "opto/intrinsicnode.hpp"
  54 #endif
  55 
  56 #ifdef PRODUCT
  57 #define BLOCK_COMMENT(str) /* nothing */
  58 #define STOP(error) stop(error)
  59 #else
  60 #define BLOCK_COMMENT(str) block_comment(str)
  61 #define STOP(error) block_comment(error); stop(error)
  62 #endif
  63 
  64 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  65 
  66 #ifdef ASSERT
  67 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  68 #endif
  69 
  70 static Assembler::Condition reverse[] = {
  71     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  72     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  73     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  74     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  75     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  76     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  77     Assembler::above          /* belowEqual    = 0x6 */ ,
  78     Assembler::belowEqual     /* above         = 0x7 */ ,
  79     Assembler::positive       /* negative      = 0x8 */ ,
  80     Assembler::negative       /* positive      = 0x9 */ ,
  81     Assembler::noParity       /* parity        = 0xa */ ,
  82     Assembler::parity         /* noParity      = 0xb */ ,
  83     Assembler::greaterEqual   /* less          = 0xc */ ,
  84     Assembler::less           /* greaterEqual  = 0xd */ ,
  85     Assembler::greater        /* lessEqual     = 0xe */ ,
  86     Assembler::lessEqual      /* greater       = 0xf, */
  87 
  88 };
  89 
  90 
  91 // Implementation of MacroAssembler
  92 
  93 // First all the versions that have distinct versions depending on 32/64 bit
  94 // Unless the difference is trivial (1 line or so).
  95 
  96 #ifndef _LP64
  97 
  98 // 32bit versions
  99 
 100 Address MacroAssembler::as_Address(AddressLiteral adr) {
 101   return Address(adr.target(), adr.rspec());
 102 }
 103 
 104 Address MacroAssembler::as_Address(ArrayAddress adr) {
 105   return Address::make_array(adr);
 106 }
 107 
 108 void MacroAssembler::call_VM_leaf_base(address entry_point,
 109                                        int number_of_arguments) {
 110   call(RuntimeAddress(entry_point));
 111   increment(rsp, number_of_arguments * wordSize);
 112 }
 113 
 114 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 115   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 116 }
 117 
 118 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 119   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 120 }
 121 
 122 void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
 123   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 124 }
 125 
 126 void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
 127   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 128 }
 129 
 130 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 131   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 132   bs->obj_equals(this, src1, obj);
 133 }
 134 
 135 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 136   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 137   bs->obj_equals(this, src1, obj);
 138 }
 139 
 140 void MacroAssembler::extend_sign(Register hi, Register lo) {
 141   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 142   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 143     cdql();
 144   } else {
 145     movl(hi, lo);
 146     sarl(hi, 31);
 147   }
 148 }
 149 
 150 void MacroAssembler::jC2(Register tmp, Label& L) {
 151   // set parity bit if FPU flag C2 is set (via rax)
 152   save_rax(tmp);
 153   fwait(); fnstsw_ax();
 154   sahf();
 155   restore_rax(tmp);
 156   // branch
 157   jcc(Assembler::parity, L);
 158 }
 159 
 160 void MacroAssembler::jnC2(Register tmp, Label& L) {
 161   // set parity bit if FPU flag C2 is set (via rax)
 162   save_rax(tmp);
 163   fwait(); fnstsw_ax();
 164   sahf();
 165   restore_rax(tmp);
 166   // branch
 167   jcc(Assembler::noParity, L);
 168 }
 169 
 170 // 32bit can do a case table jump in one instruction but we no longer allow the base
 171 // to be installed in the Address class
 172 void MacroAssembler::jump(ArrayAddress entry) {
 173   jmp(as_Address(entry));
 174 }
 175 
 176 // Note: y_lo will be destroyed
 177 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 178   // Long compare for Java (semantics as described in JVM spec.)
 179   Label high, low, done;
 180 
 181   cmpl(x_hi, y_hi);
 182   jcc(Assembler::less, low);
 183   jcc(Assembler::greater, high);
 184   // x_hi is the return register
 185   xorl(x_hi, x_hi);
 186   cmpl(x_lo, y_lo);
 187   jcc(Assembler::below, low);
 188   jcc(Assembler::equal, done);
 189 
 190   bind(high);
 191   xorl(x_hi, x_hi);
 192   increment(x_hi);
 193   jmp(done);
 194 
 195   bind(low);
 196   xorl(x_hi, x_hi);
 197   decrementl(x_hi);
 198 
 199   bind(done);
 200 }
 201 
 202 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 203     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 204 }
 205 
 206 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 207   // leal(dst, as_Address(adr));
 208   // see note in movl as to why we must use a move
 209   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 210 }
 211 
 212 void MacroAssembler::leave() {
 213   mov(rsp, rbp);
 214   pop(rbp);
 215 }
 216 
 217 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 218   // Multiplication of two Java long values stored on the stack
 219   // as illustrated below. Result is in rdx:rax.
 220   //
 221   // rsp ---> [  ??  ] \               \
 222   //            ....    | y_rsp_offset  |
 223   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 224   //          [ y_hi ]                  | (in bytes)
 225   //            ....                    |
 226   //          [ x_lo ]                 /
 227   //          [ x_hi ]
 228   //            ....
 229   //
 230   // Basic idea: lo(result) = lo(x_lo * y_lo)
 231   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 232   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 233   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 234   Label quick;
 235   // load x_hi, y_hi and check if quick
 236   // multiplication is possible
 237   movl(rbx, x_hi);
 238   movl(rcx, y_hi);
 239   movl(rax, rbx);
 240   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 241   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 242   // do full multiplication
 243   // 1st step
 244   mull(y_lo);                                    // x_hi * y_lo
 245   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 246   // 2nd step
 247   movl(rax, x_lo);
 248   mull(rcx);                                     // x_lo * y_hi
 249   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 250   // 3rd step
 251   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 252   movl(rax, x_lo);
 253   mull(y_lo);                                    // x_lo * y_lo
 254   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 255 }
 256 
 257 void MacroAssembler::lneg(Register hi, Register lo) {
 258   negl(lo);
 259   adcl(hi, 0);
 260   negl(hi);
 261 }
 262 
 263 void MacroAssembler::lshl(Register hi, Register lo) {
 264   // Java shift left long support (semantics as described in JVM spec., p.305)
 265   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 266   // shift value is in rcx !
 267   assert(hi != rcx, "must not use rcx");
 268   assert(lo != rcx, "must not use rcx");
 269   const Register s = rcx;                        // shift count
 270   const int      n = BitsPerWord;
 271   Label L;
 272   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 273   cmpl(s, n);                                    // if (s < n)
 274   jcc(Assembler::less, L);                       // else (s >= n)
 275   movl(hi, lo);                                  // x := x << n
 276   xorl(lo, lo);
 277   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 278   bind(L);                                       // s (mod n) < n
 279   shldl(hi, lo);                                 // x := x << s
 280   shll(lo);
 281 }
 282 
 283 
 284 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 285   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 286   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 287   assert(hi != rcx, "must not use rcx");
 288   assert(lo != rcx, "must not use rcx");
 289   const Register s = rcx;                        // shift count
 290   const int      n = BitsPerWord;
 291   Label L;
 292   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 293   cmpl(s, n);                                    // if (s < n)
 294   jcc(Assembler::less, L);                       // else (s >= n)
 295   movl(lo, hi);                                  // x := x >> n
 296   if (sign_extension) sarl(hi, 31);
 297   else                xorl(hi, hi);
 298   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 299   bind(L);                                       // s (mod n) < n
 300   shrdl(lo, hi);                                 // x := x >> s
 301   if (sign_extension) sarl(hi);
 302   else                shrl(hi);
 303 }
 304 
 305 void MacroAssembler::movoop(Register dst, jobject obj) {
 306   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 307 }
 308 
 309 void MacroAssembler::movoop(Address dst, jobject obj) {
 310   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 311 }
 312 
 313 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 314   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 315 }
 316 
 317 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 318   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 319 }
 320 
 321 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 322   // scratch register is not used,
 323   // it is defined to match parameters of 64-bit version of this method.
 324   if (src.is_lval()) {
 325     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 326   } else {
 327     movl(dst, as_Address(src));
 328   }
 329 }
 330 
 331 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 332   movl(as_Address(dst), src);
 333 }
 334 
 335 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 336   movl(dst, as_Address(src));
 337 }
 338 
 339 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 340 void MacroAssembler::movptr(Address dst, intptr_t src) {
 341   movl(dst, src);
 342 }
 343 
 344 
 345 void MacroAssembler::pop_callee_saved_registers() {
 346   pop(rcx);
 347   pop(rdx);
 348   pop(rdi);
 349   pop(rsi);
 350 }
 351 
 352 void MacroAssembler::pop_fTOS() {
 353   fld_d(Address(rsp, 0));
 354   addl(rsp, 2 * wordSize);
 355 }
 356 
 357 void MacroAssembler::push_callee_saved_registers() {
 358   push(rsi);
 359   push(rdi);
 360   push(rdx);
 361   push(rcx);
 362 }
 363 
 364 void MacroAssembler::push_fTOS() {
 365   subl(rsp, 2 * wordSize);
 366   fstp_d(Address(rsp, 0));
 367 }
 368 
 369 
 370 void MacroAssembler::pushoop(jobject obj) {
 371   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 372 }
 373 
 374 void MacroAssembler::pushklass(Metadata* obj) {
 375   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 376 }
 377 
 378 void MacroAssembler::pushptr(AddressLiteral src) {
 379   if (src.is_lval()) {
 380     push_literal32((int32_t)src.target(), src.rspec());
 381   } else {
 382     pushl(as_Address(src));
 383   }
 384 }
 385 
 386 void MacroAssembler::set_word_if_not_zero(Register dst) {
 387   xorl(dst, dst);
 388   set_byte_if_not_zero(dst);
 389 }
 390 
 391 static void pass_arg0(MacroAssembler* masm, Register arg) {
 392   masm->push(arg);
 393 }
 394 
 395 static void pass_arg1(MacroAssembler* masm, Register arg) {
 396   masm->push(arg);
 397 }
 398 
 399 static void pass_arg2(MacroAssembler* masm, Register arg) {
 400   masm->push(arg);
 401 }
 402 
 403 static void pass_arg3(MacroAssembler* masm, Register arg) {
 404   masm->push(arg);
 405 }
 406 
 407 #ifndef PRODUCT
 408 extern "C" void findpc(intptr_t x);
 409 #endif
 410 
 411 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 412   // In order to get locks to work, we need to fake a in_VM state
 413   JavaThread* thread = JavaThread::current();
 414   JavaThreadState saved_state = thread->thread_state();
 415   thread->set_thread_state(_thread_in_vm);
 416   if (ShowMessageBoxOnError) {
 417     JavaThread* thread = JavaThread::current();
 418     JavaThreadState saved_state = thread->thread_state();
 419     thread->set_thread_state(_thread_in_vm);
 420     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 421       ttyLocker ttyl;
 422       BytecodeCounter::print();
 423     }
 424     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 425     // This is the value of eip which points to where verify_oop will return.
 426     if (os::message_box(msg, "Execution stopped, print registers?")) {
 427       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 428       BREAKPOINT;
 429     }
 430   }
 431   fatal("DEBUG MESSAGE: %s", msg);
 432 }
 433 
 434 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 435   ttyLocker ttyl;
 436   FlagSetting fs(Debugging, true);
 437   tty->print_cr("eip = 0x%08x", eip);
 438 #ifndef PRODUCT
 439   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 440     tty->cr();
 441     findpc(eip);
 442     tty->cr();
 443   }
 444 #endif
 445 #define PRINT_REG(rax) \
 446   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 447   PRINT_REG(rax);
 448   PRINT_REG(rbx);
 449   PRINT_REG(rcx);
 450   PRINT_REG(rdx);
 451   PRINT_REG(rdi);
 452   PRINT_REG(rsi);
 453   PRINT_REG(rbp);
 454   PRINT_REG(rsp);
 455 #undef PRINT_REG
 456   // Print some words near top of staack.
 457   int* dump_sp = (int*) rsp;
 458   for (int col1 = 0; col1 < 8; col1++) {
 459     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 460     os::print_location(tty, *dump_sp++);
 461   }
 462   for (int row = 0; row < 16; row++) {
 463     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 464     for (int col = 0; col < 8; col++) {
 465       tty->print(" 0x%08x", *dump_sp++);
 466     }
 467     tty->cr();
 468   }
 469   // Print some instructions around pc:
 470   Disassembler::decode((address)eip-64, (address)eip);
 471   tty->print_cr("--------");
 472   Disassembler::decode((address)eip, (address)eip+32);
 473 }
 474 
 475 void MacroAssembler::stop(const char* msg) {
 476   ExternalAddress message((address)msg);
 477   // push address of message
 478   pushptr(message.addr());
 479   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 480   pusha();                                            // push registers
 481   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 482   hlt();
 483 }
 484 
 485 void MacroAssembler::warn(const char* msg) {
 486   push_CPU_state();
 487 
 488   ExternalAddress message((address) msg);
 489   // push address of message
 490   pushptr(message.addr());
 491 
 492   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 493   addl(rsp, wordSize);       // discard argument
 494   pop_CPU_state();
 495 }
 496 
 497 void MacroAssembler::print_state() {
 498   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 499   pusha();                                            // push registers
 500 
 501   push_CPU_state();
 502   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 503   pop_CPU_state();
 504 
 505   popa();
 506   addl(rsp, wordSize);
 507 }
 508 
 509 #else // _LP64
 510 
 511 // 64 bit versions
 512 
 513 Address MacroAssembler::as_Address(AddressLiteral adr) {
 514   // amd64 always does this as a pc-rel
 515   // we can be absolute or disp based on the instruction type
 516   // jmp/call are displacements others are absolute
 517   assert(!adr.is_lval(), "must be rval");
 518   assert(reachable(adr), "must be");
 519   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 520 
 521 }
 522 
 523 Address MacroAssembler::as_Address(ArrayAddress adr) {
 524   AddressLiteral base = adr.base();
 525   lea(rscratch1, base);
 526   Address index = adr.index();
 527   assert(index._disp == 0, "must not have disp"); // maybe it can?
 528   Address array(rscratch1, index._index, index._scale, index._disp);
 529   return array;
 530 }
 531 
 532 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 533   Label L, E;
 534 
 535 #ifdef _WIN64
 536   // Windows always allocates space for it's register args
 537   assert(num_args <= 4, "only register arguments supported");
 538   subq(rsp,  frame::arg_reg_save_area_bytes);
 539 #endif
 540 
 541   // Align stack if necessary
 542   testl(rsp, 15);
 543   jcc(Assembler::zero, L);
 544 
 545   subq(rsp, 8);
 546   {
 547     call(RuntimeAddress(entry_point));
 548   }
 549   addq(rsp, 8);
 550   jmp(E);
 551 
 552   bind(L);
 553   {
 554     call(RuntimeAddress(entry_point));
 555   }
 556 
 557   bind(E);
 558 
 559 #ifdef _WIN64
 560   // restore stack pointer
 561   addq(rsp, frame::arg_reg_save_area_bytes);
 562 #endif
 563 
 564 }
 565 
 566 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 567   assert(!src2.is_lval(), "should use cmpptr");
 568 
 569   if (reachable(src2)) {
 570     cmpq(src1, as_Address(src2));
 571   } else {
 572     lea(rscratch1, src2);
 573     Assembler::cmpq(src1, Address(rscratch1, 0));
 574   }
 575 }
 576 
 577 int MacroAssembler::corrected_idivq(Register reg) {
 578   // Full implementation of Java ldiv and lrem; checks for special
 579   // case as described in JVM spec., p.243 & p.271.  The function
 580   // returns the (pc) offset of the idivl instruction - may be needed
 581   // for implicit exceptions.
 582   //
 583   //         normal case                           special case
 584   //
 585   // input : rax: dividend                         min_long
 586   //         reg: divisor   (may not be eax/edx)   -1
 587   //
 588   // output: rax: quotient  (= rax idiv reg)       min_long
 589   //         rdx: remainder (= rax irem reg)       0
 590   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 591   static const int64_t min_long = 0x8000000000000000;
 592   Label normal_case, special_case;
 593 
 594   // check for special case
 595   cmp64(rax, ExternalAddress((address) &min_long));
 596   jcc(Assembler::notEqual, normal_case);
 597   xorl(rdx, rdx); // prepare rdx for possible special case (where
 598                   // remainder = 0)
 599   cmpq(reg, -1);
 600   jcc(Assembler::equal, special_case);
 601 
 602   // handle normal case
 603   bind(normal_case);
 604   cdqq();
 605   int idivq_offset = offset();
 606   idivq(reg);
 607 
 608   // normal and special case exit
 609   bind(special_case);
 610 
 611   return idivq_offset;
 612 }
 613 
 614 void MacroAssembler::decrementq(Register reg, int value) {
 615   if (value == min_jint) { subq(reg, value); return; }
 616   if (value <  0) { incrementq(reg, -value); return; }
 617   if (value == 0) {                        ; return; }
 618   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 619   /* else */      { subq(reg, value)       ; return; }
 620 }
 621 
 622 void MacroAssembler::decrementq(Address dst, int value) {
 623   if (value == min_jint) { subq(dst, value); return; }
 624   if (value <  0) { incrementq(dst, -value); return; }
 625   if (value == 0) {                        ; return; }
 626   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 627   /* else */      { subq(dst, value)       ; return; }
 628 }
 629 
 630 void MacroAssembler::incrementq(AddressLiteral dst) {
 631   if (reachable(dst)) {
 632     incrementq(as_Address(dst));
 633   } else {
 634     lea(rscratch1, dst);
 635     incrementq(Address(rscratch1, 0));
 636   }
 637 }
 638 
 639 void MacroAssembler::incrementq(Register reg, int value) {
 640   if (value == min_jint) { addq(reg, value); return; }
 641   if (value <  0) { decrementq(reg, -value); return; }
 642   if (value == 0) {                        ; return; }
 643   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 644   /* else */      { addq(reg, value)       ; return; }
 645 }
 646 
 647 void MacroAssembler::incrementq(Address dst, int value) {
 648   if (value == min_jint) { addq(dst, value); return; }
 649   if (value <  0) { decrementq(dst, -value); return; }
 650   if (value == 0) {                        ; return; }
 651   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 652   /* else */      { addq(dst, value)       ; return; }
 653 }
 654 
 655 // 32bit can do a case table jump in one instruction but we no longer allow the base
 656 // to be installed in the Address class
 657 void MacroAssembler::jump(ArrayAddress entry) {
 658   lea(rscratch1, entry.base());
 659   Address dispatch = entry.index();
 660   assert(dispatch._base == noreg, "must be");
 661   dispatch._base = rscratch1;
 662   jmp(dispatch);
 663 }
 664 
 665 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 666   ShouldNotReachHere(); // 64bit doesn't use two regs
 667   cmpq(x_lo, y_lo);
 668 }
 669 
 670 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 671     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 672 }
 673 
 674 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 675   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 676   movptr(dst, rscratch1);
 677 }
 678 
 679 void MacroAssembler::leave() {
 680   // %%% is this really better? Why not on 32bit too?
 681   emit_int8((unsigned char)0xC9); // LEAVE
 682 }
 683 
 684 void MacroAssembler::lneg(Register hi, Register lo) {
 685   ShouldNotReachHere(); // 64bit doesn't use two regs
 686   negq(lo);
 687 }
 688 
 689 void MacroAssembler::movoop(Register dst, jobject obj) {
 690   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 691 }
 692 
 693 void MacroAssembler::movoop(Address dst, jobject obj) {
 694   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 695   movq(dst, rscratch1);
 696 }
 697 
 698 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 699   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 700 }
 701 
 702 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 703   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 704   movq(dst, rscratch1);
 705 }
 706 
 707 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 708   if (src.is_lval()) {
 709     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 710   } else {
 711     if (reachable(src)) {
 712       movq(dst, as_Address(src));
 713     } else {
 714       lea(scratch, src);
 715       movq(dst, Address(scratch, 0));
 716     }
 717   }
 718 }
 719 
 720 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 721   movq(as_Address(dst), src);
 722 }
 723 
 724 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 725   movq(dst, as_Address(src));
 726 }
 727 
 728 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 729 void MacroAssembler::movptr(Address dst, intptr_t src) {
 730   mov64(rscratch1, src);
 731   movq(dst, rscratch1);
 732 }
 733 
 734 // These are mostly for initializing NULL
 735 void MacroAssembler::movptr(Address dst, int32_t src) {
 736   movslq(dst, src);
 737 }
 738 
 739 void MacroAssembler::movptr(Register dst, int32_t src) {
 740   mov64(dst, (intptr_t)src);
 741 }
 742 
 743 void MacroAssembler::pushoop(jobject obj) {
 744   movoop(rscratch1, obj);
 745   push(rscratch1);
 746 }
 747 
 748 void MacroAssembler::pushklass(Metadata* obj) {
 749   mov_metadata(rscratch1, obj);
 750   push(rscratch1);
 751 }
 752 
 753 void MacroAssembler::pushptr(AddressLiteral src) {
 754   lea(rscratch1, src);
 755   if (src.is_lval()) {
 756     push(rscratch1);
 757   } else {
 758     pushq(Address(rscratch1, 0));
 759   }
 760 }
 761 
 762 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 763   // we must set sp to zero to clear frame
 764   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 765   // must clear fp, so that compiled frames are not confused; it is
 766   // possible that we need it only for debugging
 767   if (clear_fp) {
 768     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 769   }
 770 
 771   // Always clear the pc because it could have been set by make_walkable()
 772   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 773   vzeroupper();
 774 }
 775 
 776 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 777                                          Register last_java_fp,
 778                                          address  last_java_pc) {
 779   vzeroupper();
 780   // determine last_java_sp register
 781   if (!last_java_sp->is_valid()) {
 782     last_java_sp = rsp;
 783   }
 784 
 785   // last_java_fp is optional
 786   if (last_java_fp->is_valid()) {
 787     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 788            last_java_fp);
 789   }
 790 
 791   // last_java_pc is optional
 792   if (last_java_pc != NULL) {
 793     Address java_pc(r15_thread,
 794                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 795     lea(rscratch1, InternalAddress(last_java_pc));
 796     movptr(java_pc, rscratch1);
 797   }
 798 
 799   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 800 }
 801 
 802 static void pass_arg0(MacroAssembler* masm, Register arg) {
 803   if (c_rarg0 != arg ) {
 804     masm->mov(c_rarg0, arg);
 805   }
 806 }
 807 
 808 static void pass_arg1(MacroAssembler* masm, Register arg) {
 809   if (c_rarg1 != arg ) {
 810     masm->mov(c_rarg1, arg);
 811   }
 812 }
 813 
 814 static void pass_arg2(MacroAssembler* masm, Register arg) {
 815   if (c_rarg2 != arg ) {
 816     masm->mov(c_rarg2, arg);
 817   }
 818 }
 819 
 820 static void pass_arg3(MacroAssembler* masm, Register arg) {
 821   if (c_rarg3 != arg ) {
 822     masm->mov(c_rarg3, arg);
 823   }
 824 }
 825 
 826 void MacroAssembler::stop(const char* msg) {
 827   address rip = pc();
 828   pusha(); // get regs on stack
 829   lea(c_rarg0, ExternalAddress((address) msg));
 830   lea(c_rarg1, InternalAddress(rip));
 831   movq(c_rarg2, rsp); // pass pointer to regs array
 832   andq(rsp, -16); // align stack as required by ABI
 833   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 834   hlt();
 835 }
 836 
 837 void MacroAssembler::warn(const char* msg) {
 838   push(rbp);
 839   movq(rbp, rsp);
 840   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 841   push_CPU_state();   // keeps alignment at 16 bytes
 842   lea(c_rarg0, ExternalAddress((address) msg));
 843   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 844   call(rax);
 845   pop_CPU_state();
 846   mov(rsp, rbp);
 847   pop(rbp);
 848 }
 849 
 850 void MacroAssembler::print_state() {
 851   address rip = pc();
 852   pusha();            // get regs on stack
 853   push(rbp);
 854   movq(rbp, rsp);
 855   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 856   push_CPU_state();   // keeps alignment at 16 bytes
 857 
 858   lea(c_rarg0, InternalAddress(rip));
 859   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 860   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 861 
 862   pop_CPU_state();
 863   mov(rsp, rbp);
 864   pop(rbp);
 865   popa();
 866 }
 867 
 868 #ifndef PRODUCT
 869 extern "C" void findpc(intptr_t x);
 870 #endif
 871 
 872 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 873   // In order to get locks to work, we need to fake a in_VM state
 874   if (ShowMessageBoxOnError) {
 875     JavaThread* thread = JavaThread::current();
 876     JavaThreadState saved_state = thread->thread_state();
 877     thread->set_thread_state(_thread_in_vm);
 878 #ifndef PRODUCT
 879     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 880       ttyLocker ttyl;
 881       BytecodeCounter::print();
 882     }
 883 #endif
 884     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 885     // XXX correct this offset for amd64
 886     // This is the value of eip which points to where verify_oop will return.
 887     if (os::message_box(msg, "Execution stopped, print registers?")) {
 888       print_state64(pc, regs);
 889       BREAKPOINT;
 890     }
 891   }
 892   fatal("DEBUG MESSAGE: %s", msg);
 893 }
 894 
 895 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 896   ttyLocker ttyl;
 897   FlagSetting fs(Debugging, true);
 898   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 899 #ifndef PRODUCT
 900   tty->cr();
 901   findpc(pc);
 902   tty->cr();
 903 #endif
 904 #define PRINT_REG(rax, value) \
 905   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 906   PRINT_REG(rax, regs[15]);
 907   PRINT_REG(rbx, regs[12]);
 908   PRINT_REG(rcx, regs[14]);
 909   PRINT_REG(rdx, regs[13]);
 910   PRINT_REG(rdi, regs[8]);
 911   PRINT_REG(rsi, regs[9]);
 912   PRINT_REG(rbp, regs[10]);
 913   PRINT_REG(rsp, regs[11]);
 914   PRINT_REG(r8 , regs[7]);
 915   PRINT_REG(r9 , regs[6]);
 916   PRINT_REG(r10, regs[5]);
 917   PRINT_REG(r11, regs[4]);
 918   PRINT_REG(r12, regs[3]);
 919   PRINT_REG(r13, regs[2]);
 920   PRINT_REG(r14, regs[1]);
 921   PRINT_REG(r15, regs[0]);
 922 #undef PRINT_REG
 923   // Print some words near top of staack.
 924   int64_t* rsp = (int64_t*) regs[11];
 925   int64_t* dump_sp = rsp;
 926   for (int col1 = 0; col1 < 8; col1++) {
 927     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 928     os::print_location(tty, *dump_sp++);
 929   }
 930   for (int row = 0; row < 25; row++) {
 931     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 932     for (int col = 0; col < 4; col++) {
 933       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 934     }
 935     tty->cr();
 936   }
 937   // Print some instructions around pc:
 938   Disassembler::decode((address)pc-64, (address)pc);
 939   tty->print_cr("--------");
 940   Disassembler::decode((address)pc, (address)pc+32);
 941 }
 942 
 943 #endif // _LP64
 944 
 945 // Now versions that are common to 32/64 bit
 946 
 947 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 948   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 949 }
 950 
 951 void MacroAssembler::addptr(Register dst, Register src) {
 952   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 953 }
 954 
 955 void MacroAssembler::addptr(Address dst, Register src) {
 956   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 957 }
 958 
 959 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 960   if (reachable(src)) {
 961     Assembler::addsd(dst, as_Address(src));
 962   } else {
 963     lea(rscratch1, src);
 964     Assembler::addsd(dst, Address(rscratch1, 0));
 965   }
 966 }
 967 
 968 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 969   if (reachable(src)) {
 970     addss(dst, as_Address(src));
 971   } else {
 972     lea(rscratch1, src);
 973     addss(dst, Address(rscratch1, 0));
 974   }
 975 }
 976 
 977 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
 978   if (reachable(src)) {
 979     Assembler::addpd(dst, as_Address(src));
 980   } else {
 981     lea(rscratch1, src);
 982     Assembler::addpd(dst, Address(rscratch1, 0));
 983   }
 984 }
 985 
 986 void MacroAssembler::align(int modulus) {
 987   align(modulus, offset());
 988 }
 989 
 990 void MacroAssembler::align(int modulus, int target) {
 991   if (target % modulus != 0) {
 992     nop(modulus - (target % modulus));
 993   }
 994 }
 995 
 996 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
 997   // Used in sign-masking with aligned address.
 998   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
 999   if (reachable(src)) {
1000     Assembler::andpd(dst, as_Address(src));
1001   } else {
1002     lea(scratch_reg, src);
1003     Assembler::andpd(dst, Address(scratch_reg, 0));
1004   }
1005 }
1006 
1007 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1008   // Used in sign-masking with aligned address.
1009   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1010   if (reachable(src)) {
1011     Assembler::andps(dst, as_Address(src));
1012   } else {
1013     lea(scratch_reg, src);
1014     Assembler::andps(dst, Address(scratch_reg, 0));
1015   }
1016 }
1017 
1018 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1019   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1020 }
1021 
1022 void MacroAssembler::atomic_incl(Address counter_addr) {
1023   lock();
1024   incrementl(counter_addr);
1025 }
1026 
1027 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1028   if (reachable(counter_addr)) {
1029     atomic_incl(as_Address(counter_addr));
1030   } else {
1031     lea(scr, counter_addr);
1032     atomic_incl(Address(scr, 0));
1033   }
1034 }
1035 
1036 #ifdef _LP64
1037 void MacroAssembler::atomic_incq(Address counter_addr) {
1038   lock();
1039   incrementq(counter_addr);
1040 }
1041 
1042 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1043   if (reachable(counter_addr)) {
1044     atomic_incq(as_Address(counter_addr));
1045   } else {
1046     lea(scr, counter_addr);
1047     atomic_incq(Address(scr, 0));
1048   }
1049 }
1050 #endif
1051 
1052 // Writes to stack successive pages until offset reached to check for
1053 // stack overflow + shadow pages.  This clobbers tmp.
1054 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1055   movptr(tmp, rsp);
1056   // Bang stack for total size given plus shadow page size.
1057   // Bang one page at a time because large size can bang beyond yellow and
1058   // red zones.
1059   Label loop;
1060   bind(loop);
1061   movl(Address(tmp, (-os::vm_page_size())), size );
1062   subptr(tmp, os::vm_page_size());
1063   subl(size, os::vm_page_size());
1064   jcc(Assembler::greater, loop);
1065 
1066   // Bang down shadow pages too.
1067   // At this point, (tmp-0) is the last address touched, so don't
1068   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1069   // was post-decremented.)  Skip this address by starting at i=1, and
1070   // touch a few more pages below.  N.B.  It is important to touch all
1071   // the way down including all pages in the shadow zone.
1072   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1073     // this could be any sized move but this is can be a debugging crumb
1074     // so the bigger the better.
1075     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1076   }
1077 }
1078 
1079 void MacroAssembler::reserved_stack_check() {
1080     // testing if reserved zone needs to be enabled
1081     Label no_reserved_zone_enabling;
1082     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1083     NOT_LP64(get_thread(rsi);)
1084 
1085     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1086     jcc(Assembler::below, no_reserved_zone_enabling);
1087 
1088     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1089     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1090     should_not_reach_here();
1091 
1092     bind(no_reserved_zone_enabling);
1093 }
1094 
1095 int MacroAssembler::biased_locking_enter(Register lock_reg,
1096                                          Register obj_reg,
1097                                          Register swap_reg,
1098                                          Register tmp_reg,
1099                                          bool swap_reg_contains_mark,
1100                                          Label& done,
1101                                          Label* slow_case,
1102                                          BiasedLockingCounters* counters) {
1103   assert(UseBiasedLocking, "why call this otherwise?");
1104   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1105   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1106   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1107   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
1108   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1109   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1110 
1111   if (PrintBiasedLockingStatistics && counters == NULL) {
1112     counters = BiasedLocking::counters();
1113   }
1114   // Biased locking
1115   // See whether the lock is currently biased toward our thread and
1116   // whether the epoch is still valid
1117   // Note that the runtime guarantees sufficient alignment of JavaThread
1118   // pointers to allow age to be placed into low bits
1119   // First check to see whether biasing is even enabled for this object
1120   Label cas_label;
1121   int null_check_offset = -1;
1122   if (!swap_reg_contains_mark) {
1123     null_check_offset = offset();
1124     movptr(swap_reg, mark_addr);
1125   }
1126   movptr(tmp_reg, swap_reg);
1127   andptr(tmp_reg, markWord::biased_lock_mask_in_place);
1128   cmpptr(tmp_reg, markWord::biased_lock_pattern);
1129   jcc(Assembler::notEqual, cas_label);
1130   // The bias pattern is present in the object's header. Need to check
1131   // whether the bias owner and the epoch are both still current.
1132 #ifndef _LP64
1133   // Note that because there is no current thread register on x86_32 we
1134   // need to store off the mark word we read out of the object to
1135   // avoid reloading it and needing to recheck invariants below. This
1136   // store is unfortunate but it makes the overall code shorter and
1137   // simpler.
1138   movptr(saved_mark_addr, swap_reg);
1139 #endif
1140   if (swap_reg_contains_mark) {
1141     null_check_offset = offset();
1142   }
1143   load_prototype_header(tmp_reg, obj_reg);
1144 #ifdef _LP64
1145   orptr(tmp_reg, r15_thread);
1146   xorptr(tmp_reg, swap_reg);
1147   Register header_reg = tmp_reg;
1148 #else
1149   xorptr(tmp_reg, swap_reg);
1150   get_thread(swap_reg);
1151   xorptr(swap_reg, tmp_reg);
1152   Register header_reg = swap_reg;
1153 #endif
1154   andptr(header_reg, ~((int) markWord::age_mask_in_place));
1155   if (counters != NULL) {
1156     cond_inc32(Assembler::zero,
1157                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1158   }
1159   jcc(Assembler::equal, done);
1160 
1161   Label try_revoke_bias;
1162   Label try_rebias;
1163 
1164   // At this point we know that the header has the bias pattern and
1165   // that we are not the bias owner in the current epoch. We need to
1166   // figure out more details about the state of the header in order to
1167   // know what operations can be legally performed on the object's
1168   // header.
1169 
1170   // If the low three bits in the xor result aren't clear, that means
1171   // the prototype header is no longer biased and we have to revoke
1172   // the bias on this object.
1173   testptr(header_reg, markWord::biased_lock_mask_in_place);
1174   jccb(Assembler::notZero, try_revoke_bias);
1175 
1176   // Biasing is still enabled for this data type. See whether the
1177   // epoch of the current bias is still valid, meaning that the epoch
1178   // bits of the mark word are equal to the epoch bits of the
1179   // prototype header. (Note that the prototype header's epoch bits
1180   // only change at a safepoint.) If not, attempt to rebias the object
1181   // toward the current thread. Note that we must be absolutely sure
1182   // that the current epoch is invalid in order to do this because
1183   // otherwise the manipulations it performs on the mark word are
1184   // illegal.
1185   testptr(header_reg, markWord::epoch_mask_in_place);
1186   jccb(Assembler::notZero, try_rebias);
1187 
1188   // The epoch of the current bias is still valid but we know nothing
1189   // about the owner; it might be set or it might be clear. Try to
1190   // acquire the bias of the object using an atomic operation. If this
1191   // fails we will go in to the runtime to revoke the object's bias.
1192   // Note that we first construct the presumed unbiased header so we
1193   // don't accidentally blow away another thread's valid bias.
1194   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1195   andptr(swap_reg,
1196          markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
1197 #ifdef _LP64
1198   movptr(tmp_reg, swap_reg);
1199   orptr(tmp_reg, r15_thread);
1200 #else
1201   get_thread(tmp_reg);
1202   orptr(tmp_reg, swap_reg);
1203 #endif
1204   lock();
1205   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1206   // If the biasing toward our thread failed, this means that
1207   // another thread succeeded in biasing it toward itself and we
1208   // need to revoke that bias. The revocation will occur in the
1209   // interpreter runtime in the slow case.
1210   if (counters != NULL) {
1211     cond_inc32(Assembler::zero,
1212                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1213   }
1214   if (slow_case != NULL) {
1215     jcc(Assembler::notZero, *slow_case);
1216   }
1217   jmp(done);
1218 
1219   bind(try_rebias);
1220   // At this point we know the epoch has expired, meaning that the
1221   // current "bias owner", if any, is actually invalid. Under these
1222   // circumstances _only_, we are allowed to use the current header's
1223   // value as the comparison value when doing the cas to acquire the
1224   // bias in the current epoch. In other words, we allow transfer of
1225   // the bias from one thread to another directly in this situation.
1226   //
1227   // FIXME: due to a lack of registers we currently blow away the age
1228   // bits in this situation. Should attempt to preserve them.
1229   load_prototype_header(tmp_reg, obj_reg);
1230 #ifdef _LP64
1231   orptr(tmp_reg, r15_thread);
1232 #else
1233   get_thread(swap_reg);
1234   orptr(tmp_reg, swap_reg);
1235   movptr(swap_reg, saved_mark_addr);
1236 #endif
1237   lock();
1238   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1239   // If the biasing toward our thread failed, then another thread
1240   // succeeded in biasing it toward itself and we need to revoke that
1241   // bias. The revocation will occur in the runtime in the slow case.
1242   if (counters != NULL) {
1243     cond_inc32(Assembler::zero,
1244                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1245   }
1246   if (slow_case != NULL) {
1247     jcc(Assembler::notZero, *slow_case);
1248   }
1249   jmp(done);
1250 
1251   bind(try_revoke_bias);
1252   // The prototype mark in the klass doesn't have the bias bit set any
1253   // more, indicating that objects of this data type are not supposed
1254   // to be biased any more. We are going to try to reset the mark of
1255   // this object to the prototype value and fall through to the
1256   // CAS-based locking scheme. Note that if our CAS fails, it means
1257   // that another thread raced us for the privilege of revoking the
1258   // bias of this particular object, so it's okay to continue in the
1259   // normal locking code.
1260   //
1261   // FIXME: due to a lack of registers we currently blow away the age
1262   // bits in this situation. Should attempt to preserve them.
1263   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1264   load_prototype_header(tmp_reg, obj_reg);
1265   lock();
1266   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1267   // Fall through to the normal CAS-based lock, because no matter what
1268   // the result of the above CAS, some thread must have succeeded in
1269   // removing the bias bit from the object's header.
1270   if (counters != NULL) {
1271     cond_inc32(Assembler::zero,
1272                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1273   }
1274 
1275   bind(cas_label);
1276 
1277   return null_check_offset;
1278 }
1279 
1280 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1281   assert(UseBiasedLocking, "why call this otherwise?");
1282 
1283   // Check for biased locking unlock case, which is a no-op
1284   // Note: we do not have to check the thread ID for two reasons.
1285   // First, the interpreter checks for IllegalMonitorStateException at
1286   // a higher level. Second, if the bias was revoked while we held the
1287   // lock, the object could not be rebiased toward another thread, so
1288   // the bias bit would be clear.
1289   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1290   andptr(temp_reg, markWord::biased_lock_mask_in_place);
1291   cmpptr(temp_reg, markWord::biased_lock_pattern);
1292   jcc(Assembler::equal, done);
1293 }
1294 
1295 #ifdef COMPILER2
1296 
1297 #if INCLUDE_RTM_OPT
1298 
1299 // Update rtm_counters based on abort status
1300 // input: abort_status
1301 //        rtm_counters (RTMLockingCounters*)
1302 // flags are killed
1303 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1304 
1305   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1306   if (PrintPreciseRTMLockingStatistics) {
1307     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1308       Label check_abort;
1309       testl(abort_status, (1<<i));
1310       jccb(Assembler::equal, check_abort);
1311       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1312       bind(check_abort);
1313     }
1314   }
1315 }
1316 
1317 // Branch if (random & (count-1) != 0), count is 2^n
1318 // tmp, scr and flags are killed
1319 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1320   assert(tmp == rax, "");
1321   assert(scr == rdx, "");
1322   rdtsc(); // modifies EDX:EAX
1323   andptr(tmp, count-1);
1324   jccb(Assembler::notZero, brLabel);
1325 }
1326 
1327 // Perform abort ratio calculation, set no_rtm bit if high ratio
1328 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1329 // tmpReg, rtm_counters_Reg and flags are killed
1330 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1331                                                  Register rtm_counters_Reg,
1332                                                  RTMLockingCounters* rtm_counters,
1333                                                  Metadata* method_data) {
1334   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1335 
1336   if (RTMLockingCalculationDelay > 0) {
1337     // Delay calculation
1338     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1339     testptr(tmpReg, tmpReg);
1340     jccb(Assembler::equal, L_done);
1341   }
1342   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1343   //   Aborted transactions = abort_count * 100
1344   //   All transactions = total_count *  RTMTotalCountIncrRate
1345   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1346 
1347   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1348   cmpptr(tmpReg, RTMAbortThreshold);
1349   jccb(Assembler::below, L_check_always_rtm2);
1350   imulptr(tmpReg, tmpReg, 100);
1351 
1352   Register scrReg = rtm_counters_Reg;
1353   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1354   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1355   imulptr(scrReg, scrReg, RTMAbortRatio);
1356   cmpptr(tmpReg, scrReg);
1357   jccb(Assembler::below, L_check_always_rtm1);
1358   if (method_data != NULL) {
1359     // set rtm_state to "no rtm" in MDO
1360     mov_metadata(tmpReg, method_data);
1361     lock();
1362     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1363   }
1364   jmpb(L_done);
1365   bind(L_check_always_rtm1);
1366   // Reload RTMLockingCounters* address
1367   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1368   bind(L_check_always_rtm2);
1369   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1370   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1371   jccb(Assembler::below, L_done);
1372   if (method_data != NULL) {
1373     // set rtm_state to "always rtm" in MDO
1374     mov_metadata(tmpReg, method_data);
1375     lock();
1376     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1377   }
1378   bind(L_done);
1379 }
1380 
1381 // Update counters and perform abort ratio calculation
1382 // input:  abort_status_Reg
1383 // rtm_counters_Reg, flags are killed
1384 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1385                                    Register rtm_counters_Reg,
1386                                    RTMLockingCounters* rtm_counters,
1387                                    Metadata* method_data,
1388                                    bool profile_rtm) {
1389 
1390   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1391   // update rtm counters based on rax value at abort
1392   // reads abort_status_Reg, updates flags
1393   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1394   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1395   if (profile_rtm) {
1396     // Save abort status because abort_status_Reg is used by following code.
1397     if (RTMRetryCount > 0) {
1398       push(abort_status_Reg);
1399     }
1400     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1401     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1402     // restore abort status
1403     if (RTMRetryCount > 0) {
1404       pop(abort_status_Reg);
1405     }
1406   }
1407 }
1408 
1409 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1410 // inputs: retry_count_Reg
1411 //       : abort_status_Reg
1412 // output: retry_count_Reg decremented by 1
1413 // flags are killed
1414 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1415   Label doneRetry;
1416   assert(abort_status_Reg == rax, "");
1417   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1418   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1419   // if reason is in 0x6 and retry count != 0 then retry
1420   andptr(abort_status_Reg, 0x6);
1421   jccb(Assembler::zero, doneRetry);
1422   testl(retry_count_Reg, retry_count_Reg);
1423   jccb(Assembler::zero, doneRetry);
1424   pause();
1425   decrementl(retry_count_Reg);
1426   jmp(retryLabel);
1427   bind(doneRetry);
1428 }
1429 
1430 // Spin and retry if lock is busy,
1431 // inputs: box_Reg (monitor address)
1432 //       : retry_count_Reg
1433 // output: retry_count_Reg decremented by 1
1434 //       : clear z flag if retry count exceeded
1435 // tmp_Reg, scr_Reg, flags are killed
1436 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1437                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1438   Label SpinLoop, SpinExit, doneRetry;
1439   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1440 
1441   testl(retry_count_Reg, retry_count_Reg);
1442   jccb(Assembler::zero, doneRetry);
1443   decrementl(retry_count_Reg);
1444   movptr(scr_Reg, RTMSpinLoopCount);
1445 
1446   bind(SpinLoop);
1447   pause();
1448   decrementl(scr_Reg);
1449   jccb(Assembler::lessEqual, SpinExit);
1450   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1451   testptr(tmp_Reg, tmp_Reg);
1452   jccb(Assembler::notZero, SpinLoop);
1453 
1454   bind(SpinExit);
1455   jmp(retryLabel);
1456   bind(doneRetry);
1457   incrementl(retry_count_Reg); // clear z flag
1458 }
1459 
1460 // Use RTM for normal stack locks
1461 // Input: objReg (object to lock)
1462 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1463                                        Register retry_on_abort_count_Reg,
1464                                        RTMLockingCounters* stack_rtm_counters,
1465                                        Metadata* method_data, bool profile_rtm,
1466                                        Label& DONE_LABEL, Label& IsInflated) {
1467   assert(UseRTMForStackLocks, "why call this otherwise?");
1468   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1469   assert(tmpReg == rax, "");
1470   assert(scrReg == rdx, "");
1471   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1472 
1473   if (RTMRetryCount > 0) {
1474     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1475     bind(L_rtm_retry);
1476   }
1477   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1478   testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
1479   jcc(Assembler::notZero, IsInflated);
1480 
1481   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1482     Label L_noincrement;
1483     if (RTMTotalCountIncrRate > 1) {
1484       // tmpReg, scrReg and flags are killed
1485       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1486     }
1487     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1488     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1489     bind(L_noincrement);
1490   }
1491   xbegin(L_on_abort);
1492   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
1493   andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
1494   cmpptr(tmpReg, markWord::unlocked_value);            // bits = 001 unlocked
1495   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1496 
1497   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1498   if (UseRTMXendForLockBusy) {
1499     xend();
1500     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1501     jmp(L_decrement_retry);
1502   }
1503   else {
1504     xabort(0);
1505   }
1506   bind(L_on_abort);
1507   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1508     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1509   }
1510   bind(L_decrement_retry);
1511   if (RTMRetryCount > 0) {
1512     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1513     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1514   }
1515 }
1516 
1517 // Use RTM for inflating locks
1518 // inputs: objReg (object to lock)
1519 //         boxReg (on-stack box address (displaced header location) - KILLED)
1520 //         tmpReg (ObjectMonitor address + markWord::monitor_value)
1521 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1522                                           Register scrReg, Register retry_on_busy_count_Reg,
1523                                           Register retry_on_abort_count_Reg,
1524                                           RTMLockingCounters* rtm_counters,
1525                                           Metadata* method_data, bool profile_rtm,
1526                                           Label& DONE_LABEL) {
1527   assert(UseRTMLocking, "why call this otherwise?");
1528   assert(tmpReg == rax, "");
1529   assert(scrReg == rdx, "");
1530   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1531   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1532 
1533   // Without cast to int32_t a movptr will destroy r10 which is typically obj
1534   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1535   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1536 
1537   if (RTMRetryCount > 0) {
1538     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1539     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1540     bind(L_rtm_retry);
1541   }
1542   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1543     Label L_noincrement;
1544     if (RTMTotalCountIncrRate > 1) {
1545       // tmpReg, scrReg and flags are killed
1546       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1547     }
1548     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1549     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1550     bind(L_noincrement);
1551   }
1552   xbegin(L_on_abort);
1553   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1554   movptr(tmpReg, Address(tmpReg, owner_offset));
1555   testptr(tmpReg, tmpReg);
1556   jcc(Assembler::zero, DONE_LABEL);
1557   if (UseRTMXendForLockBusy) {
1558     xend();
1559     jmp(L_decrement_retry);
1560   }
1561   else {
1562     xabort(0);
1563   }
1564   bind(L_on_abort);
1565   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1566   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1567     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1568   }
1569   if (RTMRetryCount > 0) {
1570     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1571     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1572   }
1573 
1574   movptr(tmpReg, Address(boxReg, owner_offset));
1575   testptr(tmpReg, tmpReg);
1576   jccb(Assembler::notZero, L_decrement_retry);
1577 
1578   // Appears unlocked - try to swing _owner from null to non-null.
1579   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1580 #ifdef _LP64
1581   Register threadReg = r15_thread;
1582 #else
1583   get_thread(scrReg);
1584   Register threadReg = scrReg;
1585 #endif
1586   lock();
1587   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1588 
1589   if (RTMRetryCount > 0) {
1590     // success done else retry
1591     jccb(Assembler::equal, DONE_LABEL) ;
1592     bind(L_decrement_retry);
1593     // Spin and retry if lock is busy.
1594     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1595   }
1596   else {
1597     bind(L_decrement_retry);
1598   }
1599 }
1600 
1601 #endif //  INCLUDE_RTM_OPT
1602 
1603 // Fast_Lock and Fast_Unlock used by C2
1604 
1605 // Because the transitions from emitted code to the runtime
1606 // monitorenter/exit helper stubs are so slow it's critical that
1607 // we inline both the stack-locking fast-path and the inflated fast path.
1608 //
1609 // See also: cmpFastLock and cmpFastUnlock.
1610 //
1611 // What follows is a specialized inline transliteration of the code
1612 // in enter() and exit(). If we're concerned about I$ bloat another
1613 // option would be to emit TrySlowEnter and TrySlowExit methods
1614 // at startup-time.  These methods would accept arguments as
1615 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1616 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1617 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1618 // In practice, however, the # of lock sites is bounded and is usually small.
1619 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1620 // if the processor uses simple bimodal branch predictors keyed by EIP,
1621 // since the helper routines would be called from multiple synchronization
1622 // sites.
1623 //
1624 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1625 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1626 // to those specialized methods.  That'd give us a mostly platform-independent
1627 // implementation that the JITs could optimize and inline at their pleasure.
1628 // Done correctly, the only time we'd need to cross to native code would be
1629 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1630 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1631 // (b) issue explicit barriers or fence operations.
1632 //
1633 // TODO:
1634 //
1635 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1636 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1637 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1638 //    the lock operators would typically be faster than reifying Self.
1639 //
1640 // *  Ideally I'd define the primitives as:
1641 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1642 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1643 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1644 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
1645 //    Furthermore the register assignments are overconstrained, possibly resulting in
1646 //    sub-optimal code near the synchronization site.
1647 //
1648 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1649 //    Alternately, use a better sp-proximity test.
1650 //
1651 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1652 //    Either one is sufficient to uniquely identify a thread.
1653 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1654 //
1655 // *  Intrinsify notify() and notifyAll() for the common cases where the
1656 //    object is locked by the calling thread but the waitlist is empty.
1657 //    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1658 //
1659 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1660 //    But beware of excessive branch density on AMD Opterons.
1661 //
1662 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1663 //    or failure of the fast-path.  If the fast-path fails then we pass
1664 //    control to the slow-path, typically in C.  In Fast_Lock and
1665 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1666 //    will emit a conditional branch immediately after the node.
1667 //    So we have branches to branches and lots of ICC.ZF games.
1668 //    Instead, it might be better to have C2 pass a "FailureLabel"
1669 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1670 //    will drop through the node.  ICC.ZF is undefined at exit.
1671 //    In the case of failure, the node will branch directly to the
1672 //    FailureLabel.
1673 
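//
// As a rough sketch (not the actual ADLC-generated code), a C2 lock site consumes
// the ZF protocol described above approximately as follows:
//
//   fast_lock(obj, box, rax, scr, ...);    // sets ZF == 1 on fast-path success
//   jcc(Assembler::notEqual, slow_path);   // ZF == 0 -> call the runtime monitorenter
//   // ZF == 1 -> fall through, the lock was acquired on the fast path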
1674 
1675 // obj: object to lock
1676 // box: on-stack box address (displaced header location) - KILLED
1677 // rax: tmp -- KILLED
1678 // scr: tmp -- KILLED
1679 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1680                                Register scrReg, Register cx1Reg, Register cx2Reg,
1681                                BiasedLockingCounters* counters,
1682                                RTMLockingCounters* rtm_counters,
1683                                RTMLockingCounters* stack_rtm_counters,
1684                                Metadata* method_data,
1685                                bool use_rtm, bool profile_rtm) {
1686   // Ensure the register assignments are disjoint
1687   assert(tmpReg == rax, "");
1688 
1689   if (use_rtm) {
1690     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1691   } else {
1692     assert(cx1Reg == noreg, "");
1693     assert(cx2Reg == noreg, "");
1694     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1695   }
1696 
1697   if (counters != NULL) {
1698     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1699   }
1700 
1701   // Possible cases that we'll encounter in fast_lock
1702   // ------------------------------------------------
1703   // * Inflated
1704   //    -- unlocked
1705   //    -- Locked
1706   //       = by self
1707   //       = by other
1708   // * biased
1709   //    -- by Self
1710   //    -- by other
1711   // * neutral
1712   // * stack-locked
1713   //    -- by self
1714   //       = sp-proximity test hits
1715   //       = sp-proximity test generates false-negative
1716   //    -- by other
1717   //
1718 
1719   Label IsInflated, DONE_LABEL;
1720 
1721   // it's stack-locked, biased or neutral
1722   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1723   // order to reduce the number of conditional branches in the most common cases.
1724   // Beware -- there's a subtle invariant that fetch of the markword
1725   // at [FETCH], below, will never observe a biased encoding (*101b).
1726   // If this invariant is not held we risk exclusion (safety) failure.
1727   if (UseBiasedLocking && !UseOptoBiasInlining) {
1728     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1729   }
1730 
1731 #if INCLUDE_RTM_OPT
1732   if (UseRTMForStackLocks && use_rtm) {
1733     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1734                       stack_rtm_counters, method_data, profile_rtm,
1735                       DONE_LABEL, IsInflated);
1736   }
1737 #endif // INCLUDE_RTM_OPT
1738 
1739   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
1740   testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
1741   jccb(Assembler::notZero, IsInflated);
1742 
1743   // Attempt stack-locking ...
1744   orptr (tmpReg, markWord::unlocked_value);
1745   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1746   lock();
1747   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
1748   if (counters != NULL) {
1749     cond_inc32(Assembler::equal,
1750                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1751   }
1752   jcc(Assembler::equal, DONE_LABEL);           // Success
1753 
1754   // Recursive locking.
1755   // The object is stack-locked: markword contains stack pointer to BasicLock.
1756   // Locked by current thread if difference with current SP is less than one page.
1757   subptr(tmpReg, rsp);
1758   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1759   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
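  // Worked example (assuming a 4K page on LP64): the mask is 7 - 4096 = 0xfffff007
  // sign-extended, so the AND result is zero -- and ZF is set -- exactly when the
  // mark-minus-rsp difference is 8-byte aligned and smaller than one page, i.e. the
  // BasicLock address in the mark lies within one page above the current rsp.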
1760   movptr(Address(boxReg, 0), tmpReg);
1761   if (counters != NULL) {
1762     cond_inc32(Assembler::equal,
1763                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1764   }
1765   jmp(DONE_LABEL);
1766 
1767   bind(IsInflated);
1768   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
1769 
1770 #if INCLUDE_RTM_OPT
1771   // Use the same RTM locking code in 32- and 64-bit VM.
1772   if (use_rtm) {
1773     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1774                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1775   } else {
1776 #endif // INCLUDE_RTM_OPT
1777 
1778 #ifndef _LP64
1779   // The object is inflated.
1780 
1781   // boxReg refers to the on-stack BasicLock in the current frame.
1782   // We'd like to write:
1783   //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
1784   // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
1785   // additional latency as we have another ST in the store buffer that must drain.
1786 
1787   // avoid ST-before-CAS
1788   // register juggle because we need tmpReg for cmpxchgptr below
1789   movptr(scrReg, boxReg);
1790   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1791 
1792   // Optimistic form: consider XORL tmpReg,tmpReg
1793   movptr(tmpReg, NULL_WORD);
1794 
1795   // Appears unlocked - try to swing _owner from null to non-null.
1796   // Ideally, I'd manifest "Self" with get_thread and then attempt
1797   // to CAS the register containing Self into m->Owner.
1798   // But we don't have enough registers, so instead we can either try to CAS
1799   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1800   // we later store "Self" into m->Owner.  Transiently storing a stack address
1801   // (rsp or the address of the box) into  m->owner is harmless.
1802   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1803   lock();
1804   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1805   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1806   // If we weren't able to swing _owner from NULL to the BasicLock
1807   // then take the slow path.
1808   jccb  (Assembler::notZero, DONE_LABEL);
1809   // update _owner from BasicLock to thread
1810   get_thread (scrReg);                    // beware: clobbers ICCs
1811   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1812   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1813 
1814   // If the CAS fails we can either retry or pass control to the slow-path.
1815   // We use the latter tactic.
1816   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1817   // If the CAS was successful ...
1818   //   Self has acquired the lock
1819   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1820   // Intentional fall-through into DONE_LABEL ...
1821 #else // _LP64
1822   // It's inflated
1823   movq(scrReg, tmpReg);
1824   xorq(tmpReg, tmpReg);
1825 
1826   lock();
1827   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1828   // Unconditionally set box->_displaced_header = markWord::unused_mark().
1829   // Without cast to int32_t movptr will destroy r10 which is typically obj.
1830   movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
1831   // Intentional fall-through into DONE_LABEL ...
1832   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1833 #endif // _LP64
1834 #if INCLUDE_RTM_OPT
1835   } // use_rtm()
1836 #endif
1837   // DONE_LABEL is a hot target - we'd really like to place it at the
1838   // start of cache line by padding with NOPs.
1839   // See the AMD and Intel software optimization manuals for the
1840   // most efficient "long" NOP encodings.
1841   // Unfortunately none of our alignment mechanisms suffice.
1842   bind(DONE_LABEL);
1843 
1844   // At DONE_LABEL the icc ZFlag is set as follows ...
1845   // Fast_Unlock uses the same protocol.
1846   // ZFlag == 1 -> Success
1847   // ZFlag == 0 -> Failure - force control through the slow-path
1848 }
1849 
1850 // obj: object to unlock
1851 // box: box address (displaced header location), killed.  Must be EAX.
1852 // tmp: killed, cannot be obj nor box.
1853 //
1854 // Some commentary on balanced locking:
1855 //
1856 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1857 // Methods that don't have provably balanced locking are forced to run in the
1858 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1859 // The interpreter provides two properties:
1860 // I1:  At return-time the interpreter automatically and quietly unlocks any
1861 //      objects acquired by the current activation (frame).  Recall that the
1862 //      interpreter maintains an on-stack list of locks currently held by
1863 //      a frame.
1864 // I2:  If a method attempts to unlock an object that is not held by the
1865 //      frame, the interpreter throws IMSX.
1866 //
1867 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1868 // B() doesn't have provably balanced locking so it runs in the interpreter.
1869 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1870 // is still locked by A().
1871 //
1872 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1873 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1874 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1875 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1876 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1877 // could reasonably *avoid* checking the owner in Fast_Unlock().
1878 // In the interest of performance we elide m->Owner==Self check in unlock.
1879 // A perfectly viable alternative is to elide the owner check except when
1880 // Xcheck:jni is enabled.
1881 
1882 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1883   assert(boxReg == rax, "");
1884   assert_different_registers(objReg, boxReg, tmpReg);
1885 
1886   Label DONE_LABEL, Stacked, CheckSucc;
1887 
1888   // Critically, the biased locking test must have precedence over
1889   // and appear before the (box->dhw == 0) recursive stack-lock test.
1890   if (UseBiasedLocking && !UseOptoBiasInlining) {
1891     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1892   }
1893 
1894 #if INCLUDE_RTM_OPT
1895   if (UseRTMForStackLocks && use_rtm) {
1896     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1897     Label L_regular_unlock;
1898     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
1899     andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
1900     cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
1901     jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
1902     xend();                                                           // otherwise end...
1903     jmp(DONE_LABEL);                                                  // ... and we're done
1904     bind(L_regular_unlock);
1905   }
1906 #endif
1907 
1908   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
1909   jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
1910   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
1911   testptr(tmpReg, markWord::monitor_value);                         // Inflated?
1912   jccb  (Assembler::zero, Stacked);
1913 
1914   // It's inflated.
1915 #if INCLUDE_RTM_OPT
1916   if (use_rtm) {
1917     Label L_regular_inflated_unlock;
1918     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1919     movptr(boxReg, Address(tmpReg, owner_offset));
1920     testptr(boxReg, boxReg);
1921     jccb(Assembler::notZero, L_regular_inflated_unlock);
1922     xend();
1923     jmpb(DONE_LABEL);
1924     bind(L_regular_inflated_unlock);
1925   }
1926 #endif
1927 
1928   // Despite our balanced locking property we still check that m->_owner == Self
1929   // as java routines or native JNI code called by this thread might
1930   // have released the lock.
1931   // Refer to the comments in synchronizer.cpp for how we might encode extra
1932   // state in _succ so we can avoid fetching EntryList|cxq.
1933   //
1934   // I'd like to add more cases in fast_lock() and fast_unlock() --
1935   // such as recursive enter and exit -- but we have to be wary of
1936   // I$ bloat, T$ effects and BP$ effects.
1937   //
1938   // If there's no contention try a 1-0 exit.  That is, exit without
1939   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1940   // we detect and recover from the race that the 1-0 exit admits.
1941   //
1942   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1943   // before it STs null into _owner, releasing the lock.  Updates
1944   // to data protected by the critical section must be visible before
1945   // we drop the lock (and thus before any other thread could acquire
1946   // the lock and observe the fields protected by the lock).
1947   // IA32's memory model is TSO (total store order), so STs are ordered with respect to
1948   // each other and there's no need for an explicit barrier (fence).
1949   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1950 #ifndef _LP64
1951   get_thread (boxReg);
1952 
1953   // Note that we could employ various encoding schemes to reduce
1954   // the number of loads below (currently 4) to just 2 or 3.
1955   // Refer to the comments in synchronizer.cpp.
1956   // In practice the chain of fetches doesn't seem to impact performance, however.
1957   xorptr(boxReg, boxReg);
1958   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1959   jccb  (Assembler::notZero, DONE_LABEL);
1960   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1961   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1962   jccb  (Assembler::notZero, CheckSucc);
1963   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1964   jmpb  (DONE_LABEL);
1965 
1966   bind (Stacked);
1967   // It's not inflated and it's not recursively stack-locked and it's not biased.
1968   // It must be stack-locked.
1969   // Try to reset the header to displaced header.
1970   // The "box" value on the stack is stable, so we can reload
1971   // and be assured we observe the same value as above.
1972   movptr(tmpReg, Address(boxReg, 0));
1973   lock();
1974   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
1975   // Intentional fall-through into DONE_LABEL
1976 
1977   // DONE_LABEL is a hot target - we'd really like to place it at the
1978   // start of cache line by padding with NOPs.
1979   // See the AMD and Intel software optimization manuals for the
1980   // most efficient "long" NOP encodings.
1981   // Unfortunately none of our alignment mechanisms suffice.
1982   bind (CheckSucc);
1983 #else // _LP64
1984   // It's inflated
1985   xorptr(boxReg, boxReg);
1986   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1987   jccb  (Assembler::notZero, DONE_LABEL);
1988   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1989   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1990   jccb  (Assembler::notZero, CheckSucc);
1991   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
1992   jmpb  (DONE_LABEL);
1993 
1994   // Try to avoid passing control into the slow_path ...
1995   Label LSuccess, LGoSlowPath ;
1996   bind  (CheckSucc);
1997 
1998   // The following optional optimization can be elided if necessary
1999   // Effectively: if (succ == null) goto SlowPath
2000   // The code reduces the window for a race, however,
2001   // and thus benefits performance.
2002   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2003   jccb  (Assembler::zero, LGoSlowPath);
2004 
2005   xorptr(boxReg, boxReg);
2006   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2007 
2008   // Memory barrier/fence
2009   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2010   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2011   // This is faster on Nehalem and AMD Shanghai/Barcelona.
2012   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2013   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2014   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2015   lock(); addl(Address(rsp, 0), 0);
2016 
2017   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2018   jccb  (Assembler::notZero, LSuccess);
2019 
2020   // Rare inopportune interleaving - race.
2021   // The successor vanished in the small window above.
2022   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2023   // We need to ensure progress and succession.
2024   // Try to reacquire the lock.
2025   // If that fails then the new owner is responsible for succession and this
2026   // thread needs to take no further action and can exit via the fast path (success).
2027   // If the re-acquire succeeds then pass control into the slow path.
2028   // As implemented, this latter mode is horrible because we generate more
2029   // coherence traffic on the lock *and* artificially extend the critical section
2030   // length by virtue of passing control into the slow path.
2031 
2032   // box is really RAX -- the following CMPXCHG depends on that binding
2033   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2034   lock();
2035   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2036   // There's no successor so we tried to regrab the lock.
2037   // If that didn't work, then another thread grabbed the
2038   // lock so we're done (and exit was a success).
2039   jccb  (Assembler::notEqual, LSuccess);
2040   // Intentional fall-through into slow-path
2041 
2042   bind  (LGoSlowPath);
2043   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2044   jmpb  (DONE_LABEL);
2045 
2046   bind  (LSuccess);
2047   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2048   jmpb  (DONE_LABEL);
2049 
2050   bind  (Stacked);
2051   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2052   lock();
2053   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2054 
2055 #endif
2056   bind(DONE_LABEL);
2057 }
2058 #endif // COMPILER2
2059 
2060 void MacroAssembler::c2bool(Register x) {
2061   // implements x == 0 ? 0 : 1
2062   // note: must only look at least-significant byte of x
2063   //       since C-style booleans are stored in one byte
2064   //       only! (was bug)
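  //       i.e. this computes x = ((x & 0xFF) != 0) ? 1 : 0: the andl both masks the
  //       low byte and sets ZF, and setb then materializes 0 or 1 from that flag.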
2065   andl(x, 0xFF);
2066   setb(Assembler::notZero, x);
2067 }
2068 
2069 // Wouldn't need if AddressLiteral version had new name
2070 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2071   Assembler::call(L, rtype);
2072 }
2073 
2074 void MacroAssembler::call(Register entry) {
2075   Assembler::call(entry);
2076 }
2077 
2078 void MacroAssembler::call(AddressLiteral entry) {
2079   if (reachable(entry)) {
2080     Assembler::call_literal(entry.target(), entry.rspec());
2081   } else {
2082     lea(rscratch1, entry);
2083     Assembler::call(rscratch1);
2084   }
2085 }
2086 
2087 void MacroAssembler::ic_call(address entry, jint method_index) {
2088   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2089   movptr(rax, (intptr_t)Universe::non_oop_word());
2090   call(AddressLiteral(entry, rh));
2091 }
2092 
2093 // Implementation of call_VM versions
2094 
2095 void MacroAssembler::call_VM(Register oop_result,
2096                              address entry_point,
2097                              bool check_exceptions) {
2098   Label C, E;
2099   call(C, relocInfo::none);
2100   jmp(E);
2101 
2102   bind(C);
2103   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2104   ret(0);
2105 
2106   bind(E);
2107 }
2108 
2109 void MacroAssembler::call_VM(Register oop_result,
2110                              address entry_point,
2111                              Register arg_1,
2112                              bool check_exceptions) {
2113   Label C, E;
2114   call(C, relocInfo::none);
2115   jmp(E);
2116 
2117   bind(C);
2118   pass_arg1(this, arg_1);
2119   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2120   ret(0);
2121 
2122   bind(E);
2123 }
2124 
2125 void MacroAssembler::call_VM(Register oop_result,
2126                              address entry_point,
2127                              Register arg_1,
2128                              Register arg_2,
2129                              bool check_exceptions) {
2130   Label C, E;
2131   call(C, relocInfo::none);
2132   jmp(E);
2133 
2134   bind(C);
2135 
2136   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2137 
2138   pass_arg2(this, arg_2);
2139   pass_arg1(this, arg_1);
2140   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2141   ret(0);
2142 
2143   bind(E);
2144 }
2145 
2146 void MacroAssembler::call_VM(Register oop_result,
2147                              address entry_point,
2148                              Register arg_1,
2149                              Register arg_2,
2150                              Register arg_3,
2151                              bool check_exceptions) {
2152   Label C, E;
2153   call(C, relocInfo::none);
2154   jmp(E);
2155 
2156   bind(C);
2157 
2158   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2159   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2160   pass_arg3(this, arg_3);
2161 
2162   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2163   pass_arg2(this, arg_2);
2164 
2165   pass_arg1(this, arg_1);
2166   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2167   ret(0);
2168 
2169   bind(E);
2170 }
2171 
2172 void MacroAssembler::call_VM(Register oop_result,
2173                              Register last_java_sp,
2174                              address entry_point,
2175                              int number_of_arguments,
2176                              bool check_exceptions) {
2177   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2178   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2179 }
2180 
2181 void MacroAssembler::call_VM(Register oop_result,
2182                              Register last_java_sp,
2183                              address entry_point,
2184                              Register arg_1,
2185                              bool check_exceptions) {
2186   pass_arg1(this, arg_1);
2187   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2188 }
2189 
2190 void MacroAssembler::call_VM(Register oop_result,
2191                              Register last_java_sp,
2192                              address entry_point,
2193                              Register arg_1,
2194                              Register arg_2,
2195                              bool check_exceptions) {
2196 
2197   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2198   pass_arg2(this, arg_2);
2199   pass_arg1(this, arg_1);
2200   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2201 }
2202 
2203 void MacroAssembler::call_VM(Register oop_result,
2204                              Register last_java_sp,
2205                              address entry_point,
2206                              Register arg_1,
2207                              Register arg_2,
2208                              Register arg_3,
2209                              bool check_exceptions) {
2210   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2211   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2212   pass_arg3(this, arg_3);
2213   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2214   pass_arg2(this, arg_2);
2215   pass_arg1(this, arg_1);
2216   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2217 }
2218 
2219 void MacroAssembler::super_call_VM(Register oop_result,
2220                                    Register last_java_sp,
2221                                    address entry_point,
2222                                    int number_of_arguments,
2223                                    bool check_exceptions) {
2224   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2225   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2226 }
2227 
2228 void MacroAssembler::super_call_VM(Register oop_result,
2229                                    Register last_java_sp,
2230                                    address entry_point,
2231                                    Register arg_1,
2232                                    bool check_exceptions) {
2233   pass_arg1(this, arg_1);
2234   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2235 }
2236 
2237 void MacroAssembler::super_call_VM(Register oop_result,
2238                                    Register last_java_sp,
2239                                    address entry_point,
2240                                    Register arg_1,
2241                                    Register arg_2,
2242                                    bool check_exceptions) {
2243 
2244   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2245   pass_arg2(this, arg_2);
2246   pass_arg1(this, arg_1);
2247   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2248 }
2249 
2250 void MacroAssembler::super_call_VM(Register oop_result,
2251                                    Register last_java_sp,
2252                                    address entry_point,
2253                                    Register arg_1,
2254                                    Register arg_2,
2255                                    Register arg_3,
2256                                    bool check_exceptions) {
2257   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2258   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2259   pass_arg3(this, arg_3);
2260   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2261   pass_arg2(this, arg_2);
2262   pass_arg1(this, arg_1);
2263   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2264 }
2265 
2266 void MacroAssembler::call_VM_base(Register oop_result,
2267                                   Register java_thread,
2268                                   Register last_java_sp,
2269                                   address  entry_point,
2270                                   int      number_of_arguments,
2271                                   bool     check_exceptions) {
2272   // determine java_thread register
2273   if (!java_thread->is_valid()) {
2274 #ifdef _LP64
2275     java_thread = r15_thread;
2276 #else
2277     java_thread = rdi;
2278     get_thread(java_thread);
2279 #endif // LP64
2280   }
2281   // determine last_java_sp register
2282   if (!last_java_sp->is_valid()) {
2283     last_java_sp = rsp;
2284   }
2285   // debugging support
2286   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2287   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2288 #ifdef ASSERT
2289   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2290   // r12 is the heapbase.
2291   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2292 #endif // ASSERT
2293 
2294   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2295   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2296 
2297   // push java thread (becomes first argument of C function)
2298 
2299   NOT_LP64(push(java_thread); number_of_arguments++);
2300   LP64_ONLY(mov(c_rarg0, r15_thread));
2301 
2302   // set last Java frame before call
2303   assert(last_java_sp != rbp, "can't use ebp/rbp");
2304 
2305   // Only interpreter should have to set fp
2306   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2307 
2308   // do the call, remove parameters
2309   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2310 
2311   // restore the thread (cannot use the pushed argument since arguments
2312   // may be overwritten by C code generated by an optimizing compiler);
2313   // however can use the register value directly if it is callee saved.
2314   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2315     // rdi & rsi (also r15) are callee saved -> nothing to do
2316 #ifdef ASSERT
2317     guarantee(java_thread != rax, "change this code");
2318     push(rax);
2319     { Label L;
2320       get_thread(rax);
2321       cmpptr(java_thread, rax);
2322       jcc(Assembler::equal, L);
2323       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2324       bind(L);
2325     }
2326     pop(rax);
2327 #endif
2328   } else {
2329     get_thread(java_thread);
2330   }
2331   // reset last Java frame
2332   // Only interpreter should have to clear fp
2333   reset_last_Java_frame(java_thread, true);
2334 
2335    // C++ interp handles this in the interpreter
2336   check_and_handle_popframe(java_thread);
2337   check_and_handle_earlyret(java_thread);
2338 
2339   if (check_exceptions) {
2340     // check for pending exceptions (java_thread is set upon return)
2341     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2342 #ifndef _LP64
2343     jump_cc(Assembler::notEqual,
2344             RuntimeAddress(StubRoutines::forward_exception_entry()));
2345 #else
2346     // This used to conditionally jump to forward_exception; however, if the code
2347     // is relocated it is possible that the branch will not reach. So we must jump
2348     // around it so that we can always reach the target.
2349 
2350     Label ok;
2351     jcc(Assembler::equal, ok);
2352     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2353     bind(ok);
2354 #endif // LP64
2355   }
2356 
2357   // get oop result if there is one and reset the value in the thread
2358   if (oop_result->is_valid()) {
2359     get_vm_result(oop_result, java_thread);
2360   }
2361 }
2362 
2363 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2364 
2365   // Calculate the value for last_Java_sp
2366   // somewhat subtle. call_VM does an intermediate call
2367   // which places a return address on the stack just under the
2368   // stack pointer as the user finished with it. This allows
2369   // us to retrieve last_Java_pc from last_Java_sp[-1].
2370   // On 32bit we then have to push additional args on the stack to accomplish
2371   // the actual requested call. On 64bit call_VM only can use register args
2372   // so the only extra space is the return address that call_VM created.
2373   // This hopefully explains the calculations here.
2374 
2375 #ifdef _LP64
2376   // We've pushed one address, correct last_Java_sp
2377   lea(rax, Address(rsp, wordSize));
2378 #else
2379   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2380 #endif // LP64
2381 
2382   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2383 
2384 }
2385 
2386 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
2387 void MacroAssembler::call_VM_leaf0(address entry_point) {
2388   MacroAssembler::call_VM_leaf_base(entry_point, 0);
2389 }
2390 
2391 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2392   call_VM_leaf_base(entry_point, number_of_arguments);
2393 }
2394 
2395 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2396   pass_arg0(this, arg_0);
2397   call_VM_leaf(entry_point, 1);
2398 }
2399 
2400 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2401 
2402   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2403   pass_arg1(this, arg_1);
2404   pass_arg0(this, arg_0);
2405   call_VM_leaf(entry_point, 2);
2406 }
2407 
2408 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2409   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2410   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2411   pass_arg2(this, arg_2);
2412   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2413   pass_arg1(this, arg_1);
2414   pass_arg0(this, arg_0);
2415   call_VM_leaf(entry_point, 3);
2416 }
2417 
2418 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2419   pass_arg0(this, arg_0);
2420   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2421 }
2422 
2423 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2424 
2425   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2426   pass_arg1(this, arg_1);
2427   pass_arg0(this, arg_0);
2428   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2429 }
2430 
2431 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2432   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2433   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2434   pass_arg2(this, arg_2);
2435   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2436   pass_arg1(this, arg_1);
2437   pass_arg0(this, arg_0);
2438   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2439 }
2440 
2441 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2442   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2443   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2444   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2445   pass_arg3(this, arg_3);
2446   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2447   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2448   pass_arg2(this, arg_2);
2449   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2450   pass_arg1(this, arg_1);
2451   pass_arg0(this, arg_0);
2452   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2453 }
2454 
2455 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2456   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2457   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2458   verify_oop(oop_result, "broken oop in call_VM_base");
2459 }
2460 
2461 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2462   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2463   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2464 }
2465 
2466 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2467 }
2468 
2469 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2470 }
2471 
2472 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2473   if (reachable(src1)) {
2474     cmpl(as_Address(src1), imm);
2475   } else {
2476     lea(rscratch1, src1);
2477     cmpl(Address(rscratch1, 0), imm);
2478   }
2479 }
2480 
2481 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2482   assert(!src2.is_lval(), "use cmpptr");
2483   if (reachable(src2)) {
2484     cmpl(src1, as_Address(src2));
2485   } else {
2486     lea(rscratch1, src2);
2487     cmpl(src1, Address(rscratch1, 0));
2488   }
2489 }
2490 
2491 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2492   Assembler::cmpl(src1, imm);
2493 }
2494 
2495 void MacroAssembler::cmp32(Register src1, Address src2) {
2496   Assembler::cmpl(src1, src2);
2497 }
2498 
2499 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2500   ucomisd(opr1, opr2);
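  // ucomisd leaves (ZF,PF,CF) = (1,1,1) if unordered, (0,0,1) if opr1 < opr2,
  // (1,0,0) if equal, and (0,0,0) if opr1 > opr2, which is why the parity
  // (unordered) case is filtered before the below/above and equal tests.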
2501 
2502   Label L;
2503   if (unordered_is_less) {
2504     movl(dst, -1);
2505     jcc(Assembler::parity, L);
2506     jcc(Assembler::below , L);
2507     movl(dst, 0);
2508     jcc(Assembler::equal , L);
2509     increment(dst);
2510   } else { // unordered is greater
2511     movl(dst, 1);
2512     jcc(Assembler::parity, L);
2513     jcc(Assembler::above , L);
2514     movl(dst, 0);
2515     jcc(Assembler::equal , L);
2516     decrementl(dst);
2517   }
2518   bind(L);
2519 }
2520 
2521 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2522   ucomiss(opr1, opr2);
2523 
2524   Label L;
2525   if (unordered_is_less) {
2526     movl(dst, -1);
2527     jcc(Assembler::parity, L);
2528     jcc(Assembler::below , L);
2529     movl(dst, 0);
2530     jcc(Assembler::equal , L);
2531     increment(dst);
2532   } else { // unordered is greater
2533     movl(dst, 1);
2534     jcc(Assembler::parity, L);
2535     jcc(Assembler::above , L);
2536     movl(dst, 0);
2537     jcc(Assembler::equal , L);
2538     decrementl(dst);
2539   }
2540   bind(L);
2541 }
2542 
2543 
2544 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2545   if (reachable(src1)) {
2546     cmpb(as_Address(src1), imm);
2547   } else {
2548     lea(rscratch1, src1);
2549     cmpb(Address(rscratch1, 0), imm);
2550   }
2551 }
2552 
2553 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2554 #ifdef _LP64
2555   if (src2.is_lval()) {
2556     movptr(rscratch1, src2);
2557     Assembler::cmpq(src1, rscratch1);
2558   } else if (reachable(src2)) {
2559     cmpq(src1, as_Address(src2));
2560   } else {
2561     lea(rscratch1, src2);
2562     Assembler::cmpq(src1, Address(rscratch1, 0));
2563   }
2564 #else
2565   if (src2.is_lval()) {
2566     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2567   } else {
2568     cmpl(src1, as_Address(src2));
2569   }
2570 #endif // _LP64
2571 }
2572 
2573 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2574   assert(src2.is_lval(), "not a mem-mem compare");
2575 #ifdef _LP64
2576   // moves src2's literal address
2577   movptr(rscratch1, src2);
2578   Assembler::cmpq(src1, rscratch1);
2579 #else
2580   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2581 #endif // _LP64
2582 }
2583 
2584 void MacroAssembler::cmpoop(Register src1, Register src2) {
2585   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2586   bs->obj_equals(this, src1, src2);
2587 }
2588 
2589 void MacroAssembler::cmpoop(Register src1, Address src2) {
2590   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2591   bs->obj_equals(this, src1, src2);
2592 }
2593 
2594 #ifdef _LP64
2595 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2596   movoop(rscratch1, src2);
2597   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2598   bs->obj_equals(this, src1, rscratch1);
2599 }
2600 #endif
2601 
2602 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2603   if (reachable(adr)) {
2604     lock();
2605     cmpxchgptr(reg, as_Address(adr));
2606   } else {
2607     lea(rscratch1, adr);
2608     lock();
2609     cmpxchgptr(reg, Address(rscratch1, 0));
2610   }
2611 }
2612 
2613 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2614   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2615 }
2616 
2617 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2618   if (reachable(src)) {
2619     Assembler::comisd(dst, as_Address(src));
2620   } else {
2621     lea(rscratch1, src);
2622     Assembler::comisd(dst, Address(rscratch1, 0));
2623   }
2624 }
2625 
2626 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2627   if (reachable(src)) {
2628     Assembler::comiss(dst, as_Address(src));
2629   } else {
2630     lea(rscratch1, src);
2631     Assembler::comiss(dst, Address(rscratch1, 0));
2632   }
2633 }
2634 
2635 
2636 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2637   Condition negated_cond = negate_condition(cond);
2638   Label L;
2639   jcc(negated_cond, L);
2640   pushf(); // Preserve flags
2641   atomic_incl(counter_addr);
2642   popf();
2643   bind(L);
2644 }
2645 
2646 int MacroAssembler::corrected_idivl(Register reg) {
2647   // Full implementation of Java idiv and irem; checks for
2648   // special case as described in JVM spec., p.243 & p.271.
2649   // The function returns the (pc) offset of the idivl
2650   // instruction - may be needed for implicit exceptions.
2651   //
2652   //         normal case                           special case
2653   //
2654   // input : rax: dividend                          min_int
2655   //         reg: divisor   (may not be rax/rdx)    -1
2656   //
2657   // output: rax: quotient  (= rax idiv reg)        min_int
2658   //         rdx: remainder (= rax irem reg)        0
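  //
  // The special case must be filtered in software because the hardware idiv
  // raises a divide-error (#DE) fault on min_int / -1 (quotient overflow),
  // whereas the JVM spec defines that result as min_int with remainder 0.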
2659   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
2660   const int min_int = 0x80000000;
2661   Label normal_case, special_case;
2662 
2663   // check for special case
2664   cmpl(rax, min_int);
2665   jcc(Assembler::notEqual, normal_case);
2666   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2667   cmpl(reg, -1);
2668   jcc(Assembler::equal, special_case);
2669 
2670   // handle normal case
2671   bind(normal_case);
2672   cdql();
2673   int idivl_offset = offset();
2674   idivl(reg);
2675 
2676   // normal and special case exit
2677   bind(special_case);
2678 
2679   return idivl_offset;
2680 }
2681 
2682 
2683 
2684 void MacroAssembler::decrementl(Register reg, int value) {
2685   if (value == min_jint) {subl(reg, value) ; return; }
2686   if (value <  0) { incrementl(reg, -value); return; }
2687   if (value == 0) {                        ; return; }
2688   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2689   /* else */      { subl(reg, value)       ; return; }
2690 }
2691 
2692 void MacroAssembler::decrementl(Address dst, int value) {
2693   if (value == min_jint) {subl(dst, value) ; return; }
2694   if (value <  0) { incrementl(dst, -value); return; }
2695   if (value == 0) {                        ; return; }
2696   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2697   /* else */      { subl(dst, value)       ; return; }
2698 }
2699 
2700 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2701   assert (shift_value > 0, "illegal shift value");
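  // Signed division by 2^shift_value with rounding toward zero. Worked example:
  // -7 / 4 -> add the bias (1 << 2) - 1 = 3 giving -4, then sar by 2 gives -1;
  // a plain arithmetic shift alone would yield -2 (rounding toward -infinity).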
2702   Label _is_positive;
2703   testl (reg, reg);
2704   jcc (Assembler::positive, _is_positive);
2705   int offset = (1 << shift_value) - 1;
2706 
2707   if (offset == 1) {
2708     incrementl(reg);
2709   } else {
2710     addl(reg, offset);
2711   }
2712 
2713   bind (_is_positive);
2714   sarl(reg, shift_value);
2715 }
2716 
2717 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2718   if (reachable(src)) {
2719     Assembler::divsd(dst, as_Address(src));
2720   } else {
2721     lea(rscratch1, src);
2722     Assembler::divsd(dst, Address(rscratch1, 0));
2723   }
2724 }
2725 
2726 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2727   if (reachable(src)) {
2728     Assembler::divss(dst, as_Address(src));
2729   } else {
2730     lea(rscratch1, src);
2731     Assembler::divss(dst, Address(rscratch1, 0));
2732   }
2733 }
2734 
2735 // !defined(COMPILER2) is because of stupid core builds
2736 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2737 void MacroAssembler::empty_FPU_stack() {
2738   if (VM_Version::supports_mmx()) {
2739     emms();
2740   } else {
2741     for (int i = 8; i-- > 0; ) ffree(i);
2742   }
2743 }
2744 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2745 
2746 
2747 void MacroAssembler::enter() {
2748   push(rbp);
2749   mov(rbp, rsp);
2750 }
2751 
2752 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2753 void MacroAssembler::fat_nop() {
2754   if (UseAddressNop) {
2755     addr_nop_5();
2756   } else {
2757     emit_int8(0x26); // es:
2758     emit_int8(0x2e); // cs:
2759     emit_int8(0x64); // fs:
2760     emit_int8(0x65); // gs:
2761     emit_int8((unsigned char)0x90);
2762   }
2763 }
2764 
2765 void MacroAssembler::fcmp(Register tmp) {
2766   fcmp(tmp, 1, true, true);
2767 }
2768 
2769 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2770   assert(!pop_right || pop_left, "usage error");
2771   if (VM_Version::supports_cmov()) {
2772     assert(tmp == noreg, "unneeded temp");
2773     if (pop_left) {
2774       fucomip(index);
2775     } else {
2776       fucomi(index);
2777     }
2778     if (pop_right) {
2779       fpop();
2780     }
2781   } else {
2782     assert(tmp != noreg, "need temp");
2783     if (pop_left) {
2784       if (pop_right) {
2785         fcompp();
2786       } else {
2787         fcomp(index);
2788       }
2789     } else {
2790       fcom(index);
2791     }
2792     // convert FPU condition into eflags condition via rax,
2793     save_rax(tmp);
2794     fwait(); fnstsw_ax();
2795     sahf();
2796     restore_rax(tmp);
2797   }
2798   // condition codes set as follows:
2799   //
2800   // CF (corresponds to C0) if x < y
2801   // PF (corresponds to C2) if unordered
2802   // ZF (corresponds to C3) if x = y
2803 }
2804 
2805 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2806   fcmp2int(dst, unordered_is_less, 1, true, true);
2807 }
2808 
2809 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2810   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2811   Label L;
2812   if (unordered_is_less) {
2813     movl(dst, -1);
2814     jcc(Assembler::parity, L);
2815     jcc(Assembler::below , L);
2816     movl(dst, 0);
2817     jcc(Assembler::equal , L);
2818     increment(dst);
2819   } else { // unordered is greater
2820     movl(dst, 1);
2821     jcc(Assembler::parity, L);
2822     jcc(Assembler::above , L);
2823     movl(dst, 0);
2824     jcc(Assembler::equal , L);
2825     decrementl(dst);
2826   }
2827   bind(L);
2828 }
2829 
2830 void MacroAssembler::fld_d(AddressLiteral src) {
2831   fld_d(as_Address(src));
2832 }
2833 
2834 void MacroAssembler::fld_s(AddressLiteral src) {
2835   fld_s(as_Address(src));
2836 }
2837 
2838 void MacroAssembler::fld_x(AddressLiteral src) {
2839   Assembler::fld_x(as_Address(src));
2840 }
2841 
2842 void MacroAssembler::fldcw(AddressLiteral src) {
2843   Assembler::fldcw(as_Address(src));
2844 }
2845 
2846 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2847   if (reachable(src)) {
2848     Assembler::mulpd(dst, as_Address(src));
2849   } else {
2850     lea(rscratch1, src);
2851     Assembler::mulpd(dst, Address(rscratch1, 0));
2852   }
2853 }
2854 
2855 void MacroAssembler::increase_precision() {
2856   subptr(rsp, BytesPerWord);
2857   fnstcw(Address(rsp, 0));
2858   movl(rax, Address(rsp, 0));
2859   orl(rax, 0x300);
2860   push(rax);
2861   fldcw(Address(rsp, 0));
2862   pop(rax);
2863 }
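
     // Note on the 0x300 above: bits 8 and 9 of the x87 control word form the
     // precision-control (PC) field, and setting both selects 64-bit (extended)
     // significand precision. The original control word stays saved at (rsp) so
     // that restore_precision() can reload it and release the slot.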
2864 
2865 void MacroAssembler::restore_precision() {
2866   fldcw(Address(rsp, 0));
2867   addptr(rsp, BytesPerWord);
2868 }
2869 
2870 void MacroAssembler::fpop() {
2871   ffree();
2872   fincstp();
2873 }
2874 
2875 void MacroAssembler::load_float(Address src) {
2876   if (UseSSE >= 1) {
2877     movflt(xmm0, src);
2878   } else {
2879     LP64_ONLY(ShouldNotReachHere());
2880     NOT_LP64(fld_s(src));
2881   }
2882 }
2883 
2884 void MacroAssembler::store_float(Address dst) {
2885   if (UseSSE >= 1) {
2886     movflt(dst, xmm0);
2887   } else {
2888     LP64_ONLY(ShouldNotReachHere());
2889     NOT_LP64(fstp_s(dst));
2890   }
2891 }
2892 
2893 void MacroAssembler::load_double(Address src) {
2894   if (UseSSE >= 2) {
2895     movdbl(xmm0, src);
2896   } else {
2897     LP64_ONLY(ShouldNotReachHere());
2898     NOT_LP64(fld_d(src));
2899   }
2900 }
2901 
2902 void MacroAssembler::store_double(Address dst) {
2903   if (UseSSE >= 2) {
2904     movdbl(dst, xmm0);
2905   } else {
2906     LP64_ONLY(ShouldNotReachHere());
2907     NOT_LP64(fstp_d(dst));
2908   }
2909 }
2910 
2911 void MacroAssembler::fremr(Register tmp) {
2912   save_rax(tmp);
2913   { Label L;
2914     bind(L);
2915     fprem();
2916     fwait(); fnstsw_ax();
2917 #ifdef _LP64
2918     testl(rax, 0x400);
2919     jcc(Assembler::notEqual, L);
2920 #else
2921     sahf();
2922     jcc(Assembler::parity, L);
2923 #endif // _LP64
2924   }
2925   restore_rax(tmp);
2926   // Result is in ST0.
2927   // Note: fxch & fpop to get rid of ST1
2928   // (otherwise FPU stack could overflow eventually)
2929   fxch(1);
2930   fpop();
2931 }
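
     // Why the loop above repeats: fprem may deliver only a partial remainder and
     // reports this by setting C2 (bit 0x400) in the FPU status word, so the code
     // spins until C2 clears. On 64-bit the bit is tested directly in rax; on
     // 32-bit, sahf maps C2 onto the parity flag, hence the jcc(parity) retry.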
2932 
2933 // dst = c = a * b + c
2934 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2935   Assembler::vfmadd231sd(c, a, b);
2936   if (dst != c) {
2937     movdbl(dst, c);
2938   }
2939 }
2940 
2941 // dst = c = a * b + c
2942 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2943   Assembler::vfmadd231ss(c, a, b);
2944   if (dst != c) {
2945     movflt(dst, c);
2946   }
2947 }
2948 
2949 // dst = c = a * b + c
2950 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2951   Assembler::vfmadd231pd(c, a, b, vector_len);
2952   if (dst != c) {
2953     vmovdqu(dst, c);
2954   }
2955 }
2956 
2957 // dst = c = a * b + c
2958 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2959   Assembler::vfmadd231ps(c, a, b, vector_len);
2960   if (dst != c) {
2961     vmovdqu(dst, c);
2962   }
2963 }
2964 
2965 // dst = c = a * b + c
2966 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2967   Assembler::vfmadd231pd(c, a, b, vector_len);
2968   if (dst != c) {
2969     vmovdqu(dst, c);
2970   }
2971 }
2972 
2973 // dst = c = a * b + c
2974 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2975   Assembler::vfmadd231ps(c, a, b, vector_len);
2976   if (dst != c) {
2977     vmovdqu(dst, c);
2978   }
2979 }
2980 
2981 void MacroAssembler::incrementl(AddressLiteral dst) {
2982   if (reachable(dst)) {
2983     incrementl(as_Address(dst));
2984   } else {
2985     lea(rscratch1, dst);
2986     incrementl(Address(rscratch1, 0));
2987   }
2988 }
2989 
2990 void MacroAssembler::incrementl(ArrayAddress dst) {
2991   incrementl(as_Address(dst));
2992 }
2993 
2994 void MacroAssembler::incrementl(Register reg, int value) {
2995   if (value == min_jint) {addl(reg, value) ; return; }
2996   if (value <  0) { decrementl(reg, -value); return; }
2997   if (value == 0) {                        ; return; }
2998   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2999   /* else */      { addl(reg, value)       ; return; }
3000 }
3001 
3002 void MacroAssembler::incrementl(Address dst, int value) {
3003   if (value == min_jint) {addl(dst, value) ; return; }
3004   if (value <  0) { decrementl(dst, -value); return; }
3005   if (value == 0) {                        ; return; }
3006   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3007   /* else */      { addl(dst, value)       ; return; }
3008 }
3009 
3010 void MacroAssembler::jump(AddressLiteral dst) {
3011   if (reachable(dst)) {
3012     jmp_literal(dst.target(), dst.rspec());
3013   } else {
3014     lea(rscratch1, dst);
3015     jmp(rscratch1);
3016   }
3017 }
3018 
3019 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3020   if (reachable(dst)) {
3021     InstructionMark im(this);
3022     relocate(dst.reloc());
3023     const int short_size = 2;
3024     const int long_size = 6;
3025     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3026     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3027       // 0111 tttn #8-bit disp
3028       emit_int8(0x70 | cc);
3029       emit_int8((offs - short_size) & 0xFF);
3030     } else {
3031       // 0000 1111 1000 tttn #32-bit disp
3032       emit_int8(0x0F);
3033       emit_int8((unsigned char)(0x80 | cc));
3034       emit_int32(offs - long_size);
3035     }
3036   } else {
3037 #ifdef ASSERT
3038     warning("reversing conditional branch");
3039 #endif /* ASSERT */
3040     Label skip;
3041     jccb(reverse[cc], skip);
3042     lea(rscratch1, dst);
3043     Assembler::jmp(rscratch1);
3044     bind(skip);
3045   }
3046 }
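
     // (In the unreachable case the sense of the branch is inverted through the
     // reverse[] condition table so that a short jccb can hop over an indirect
     // 64-bit jump through rscratch1, which is why ASSERT builds print the
     // warning above.)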
3047 
3048 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3049   if (reachable(src)) {
3050     Assembler::ldmxcsr(as_Address(src));
3051   } else {
3052     lea(rscratch1, src);
3053     Assembler::ldmxcsr(Address(rscratch1, 0));
3054   }
3055 }
3056 
3057 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3058   int off;
3059   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3060     off = offset();
3061     movsbl(dst, src); // movsxb
3062   } else {
3063     off = load_unsigned_byte(dst, src);
3064     shll(dst, 24);
3065     sarl(dst, 24);
3066   }
3067   return off;
3068 }
3069 
3070 // Note: load_signed_short used to be called load_signed_word.
3071 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3072 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3073 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3074 int MacroAssembler::load_signed_short(Register dst, Address src) {
3075   int off;
3076   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3077     // Note: a sign extension straight from 16 to 64 bits would also seem safe here,
3078     // but this matches what the 64-bit code has always done, which suggests that
3079     // callers only rely on the low 32 bits of the result.
3080     off = offset();
3081     movswl(dst, src); // movsxw
3082   } else {
3083     off = load_unsigned_short(dst, src);
3084     shll(dst, 16);
3085     sarl(dst, 16);
3086   }
3087   return off;
3088 }
3089 
3090 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3091   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3092   // and "3.9 Partial Register Penalties", p. 22.
3093   int off;
3094   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3095     off = offset();
3096     movzbl(dst, src); // movzxb
3097   } else {
3098     xorl(dst, dst);
3099     off = offset();
3100     movb(dst, src);
3101   }
3102   return off;
3103 }
3104 
3105 // Note: load_unsigned_short used to be called load_unsigned_word.
3106 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3107   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3108   // and "3.9 Partial Register Penalties", p. 22.
3109   int off;
3110   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3111     off = offset();
3112     movzwl(dst, src); // movzxw
3113   } else {
3114     xorl(dst, dst);
3115     off = offset();
3116     movw(dst, src);
3117   }
3118   return off;
3119 }
3120 
3121 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3122   switch (size_in_bytes) {
3123 #ifndef _LP64
3124   case  8:
3125     assert(dst2 != noreg, "second dest register required");
3126     movl(dst,  src);
3127     movl(dst2, src.plus_disp(BytesPerInt));
3128     break;
3129 #else
3130   case  8:  movq(dst, src); break;
3131 #endif
3132   case  4:  movl(dst, src); break;
3133   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3134   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3135   default:  ShouldNotReachHere();
3136   }
3137 }
3138 
3139 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3140   switch (size_in_bytes) {
3141 #ifndef _LP64
3142   case  8:
3143     assert(src2 != noreg, "second source register required");
3144     movl(dst,                        src);
3145     movl(dst.plus_disp(BytesPerInt), src2);
3146     break;
3147 #else
3148   case  8:  movq(dst, src); break;
3149 #endif
3150   case  4:  movl(dst, src); break;
3151   case  2:  movw(dst, src); break;
3152   case  1:  movb(dst, src); break;
3153   default:  ShouldNotReachHere();
3154   }
3155 }
3156 
3157 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3158   if (reachable(dst)) {
3159     movl(as_Address(dst), src);
3160   } else {
3161     lea(rscratch1, dst);
3162     movl(Address(rscratch1, 0), src);
3163   }
3164 }
3165 
3166 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3167   if (reachable(src)) {
3168     movl(dst, as_Address(src));
3169   } else {
3170     lea(rscratch1, src);
3171     movl(dst, Address(rscratch1, 0));
3172   }
3173 }
3174 
3175 // C++ bool manipulation
3176 
3177 void MacroAssembler::movbool(Register dst, Address src) {
3178   if(sizeof(bool) == 1)
3179     movb(dst, src);
3180   else if(sizeof(bool) == 2)
3181     movw(dst, src);
3182   else if(sizeof(bool) == 4)
3183     movl(dst, src);
3184   else
3185     // unsupported
3186     ShouldNotReachHere();
3187 }
3188 
3189 void MacroAssembler::movbool(Address dst, bool boolconst) {
3190   if(sizeof(bool) == 1)
3191     movb(dst, (int) boolconst);
3192   else if(sizeof(bool) == 2)
3193     movw(dst, (int) boolconst);
3194   else if(sizeof(bool) == 4)
3195     movl(dst, (int) boolconst);
3196   else
3197     // unsupported
3198     ShouldNotReachHere();
3199 }
3200 
3201 void MacroAssembler::movbool(Address dst, Register src) {
3202   if(sizeof(bool) == 1)
3203     movb(dst, src);
3204   else if(sizeof(bool) == 2)
3205     movw(dst, src);
3206   else if(sizeof(bool) == 4)
3207     movl(dst, src);
3208   else
3209     // unsupported
3210     ShouldNotReachHere();
3211 }
3212 
3213 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3214   movb(as_Address(dst), src);
3215 }
3216 
3217 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3218   if (reachable(src)) {
3219     movdl(dst, as_Address(src));
3220   } else {
3221     lea(rscratch1, src);
3222     movdl(dst, Address(rscratch1, 0));
3223   }
3224 }
3225 
3226 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3227   if (reachable(src)) {
3228     movq(dst, as_Address(src));
3229   } else {
3230     lea(rscratch1, src);
3231     movq(dst, Address(rscratch1, 0));
3232   }
3233 }
3234 
3235 #ifdef COMPILER2
3236 void MacroAssembler::setvectmask(Register dst, Register src) {
3237   guarantee(PostLoopMultiversioning, "must be");
3238   Assembler::movl(dst, 1);
3239   Assembler::shlxl(dst, dst, src);
3240   Assembler::decl(dst);
3241   Assembler::kmovdl(k1, dst);
3242   Assembler::movl(dst, src);
3243 }
3244 
3245 void MacroAssembler::restorevectmask() {
3246   guarantee(PostLoopMultiversioning, "must be");
3247   Assembler::knotwl(k1, k0);
3248 }
3249 #endif // COMPILER2
3250 
3251 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3252   if (reachable(src)) {
3253     if (UseXmmLoadAndClearUpper) {
3254       movsd (dst, as_Address(src));
3255     } else {
3256       movlpd(dst, as_Address(src));
3257     }
3258   } else {
3259     lea(rscratch1, src);
3260     if (UseXmmLoadAndClearUpper) {
3261       movsd (dst, Address(rscratch1, 0));
3262     } else {
3263       movlpd(dst, Address(rscratch1, 0));
3264     }
3265   }
3266 }
3267 
3268 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3269   if (reachable(src)) {
3270     movss(dst, as_Address(src));
3271   } else {
3272     lea(rscratch1, src);
3273     movss(dst, Address(rscratch1, 0));
3274   }
3275 }
3276 
3277 void MacroAssembler::movptr(Register dst, Register src) {
3278   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3279 }
3280 
3281 void MacroAssembler::movptr(Register dst, Address src) {
3282   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3283 }
3284 
3285 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3286 void MacroAssembler::movptr(Register dst, intptr_t src) {
3287   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3288 }
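
     // A minimal usage sketch contrasting the two cases (the operands here are
     // hypothetical, chosen only for illustration):
     //   movptr(rbx, (intptr_t) 0x1234);                    // raw bit pattern, no relocation recorded
     //   lea(rbx, ExternalAddress((address) some_target));  // genuine pointer: carries relocation info
     // A raw intptr_t is emitted without relocation metadata, so any value that the
     // GC or the code-cache relocation machinery must be able to find has to be
     // materialized through an AddressLiteral instead.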
3289 
3290 void MacroAssembler::movptr(Address dst, Register src) {
3291   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3292 }
3293 
3294 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3295     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3296     Assembler::movdqu(dst, src);
3297 }
3298 
3299 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3300     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3301     Assembler::movdqu(dst, src);
3302 }
3303 
3304 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3305     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3306     Assembler::movdqu(dst, src);
3307 }
3308 
3309 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3310   if (reachable(src)) {
3311     movdqu(dst, as_Address(src));
3312   } else {
3313     lea(scratchReg, src);
3314     movdqu(dst, Address(scratchReg, 0));
3315   }
3316 }
3317 
3318 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3319     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3320     Assembler::vmovdqu(dst, src);
3321 }
3322 
3323 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3324     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3325     Assembler::vmovdqu(dst, src);
3326 }
3327 
3328 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3329     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3330     Assembler::vmovdqu(dst, src);
3331 }
3332 
3333 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3334   if (reachable(src)) {
3335     vmovdqu(dst, as_Address(src));
3336   }
3337   else {
3338     lea(scratch_reg, src);
3339     vmovdqu(dst, Address(scratch_reg, 0));
3340   }
3341 }
3342 
3343 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3344   if (reachable(src)) {
3345     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3346   } else {
3347     lea(rscratch, src);
3348     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3349   }
3350 }
3351 
3352 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3353   if (reachable(src)) {
3354     Assembler::movdqa(dst, as_Address(src));
3355   } else {
3356     lea(rscratch1, src);
3357     Assembler::movdqa(dst, Address(rscratch1, 0));
3358   }
3359 }
3360 
3361 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3362   if (reachable(src)) {
3363     Assembler::movsd(dst, as_Address(src));
3364   } else {
3365     lea(rscratch1, src);
3366     Assembler::movsd(dst, Address(rscratch1, 0));
3367   }
3368 }
3369 
3370 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3371   if (reachable(src)) {
3372     Assembler::movss(dst, as_Address(src));
3373   } else {
3374     lea(rscratch1, src);
3375     Assembler::movss(dst, Address(rscratch1, 0));
3376   }
3377 }
3378 
3379 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3380   if (reachable(src)) {
3381     Assembler::mulsd(dst, as_Address(src));
3382   } else {
3383     lea(rscratch1, src);
3384     Assembler::mulsd(dst, Address(rscratch1, 0));
3385   }
3386 }
3387 
3388 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3389   if (reachable(src)) {
3390     Assembler::mulss(dst, as_Address(src));
3391   } else {
3392     lea(rscratch1, src);
3393     Assembler::mulss(dst, Address(rscratch1, 0));
3394   }
3395 }
3396 
3397 void MacroAssembler::null_check(Register reg, int offset) {
3398   if (needs_explicit_null_check(offset)) {
3399     // provoke OS NULL exception if reg = NULL by
3400     // accessing M[reg] w/o changing any (non-CC) registers
3401     // NOTE: cmpl is plenty here to provoke a segv
3402     cmpptr(rax, Address(reg, 0));
3403     // Note: should probably use testl(rax, Address(reg, 0));
3404     //       may be shorter code (however, this version of
3405     //       testl needs to be implemented first)
3406   } else {
3407     // nothing to do, (later) access of M[reg + offset]
3408     // will provoke OS NULL exception if reg = NULL
3409   }
3410 }
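
     // (needs_explicit_null_check() is presumably true when the offset is outside
     // the range covered by the protected page at address zero, in which case the
     // implicit-fault shortcut in the else branch above would not be reliable.)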
3411 
3412 void MacroAssembler::os_breakpoint() {
3413   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3414   // (e.g., MSVC can't call ps() otherwise)
3415   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3416 }
3417 
3418 void MacroAssembler::unimplemented(const char* what) {
3419   const char* buf = NULL;
3420   {
3421     ResourceMark rm;
3422     stringStream ss;
3423     ss.print("unimplemented: %s", what);
3424     buf = code_string(ss.as_string());
3425   }
3426   stop(buf);
3427 }
3428 
3429 #ifdef _LP64
3430 #define XSTATE_BV 0x200
3431 #endif
3432 
3433 void MacroAssembler::pop_CPU_state() {
3434   pop_FPU_state();
3435   pop_IU_state();
3436 }
3437 
3438 void MacroAssembler::pop_FPU_state() {
3439 #ifndef _LP64
3440   frstor(Address(rsp, 0));
3441 #else
3442   fxrstor(Address(rsp, 0));
3443 #endif
3444   addptr(rsp, FPUStateSizeInWords * wordSize);
3445 }
3446 
3447 void MacroAssembler::pop_IU_state() {
3448   popa();
3449   LP64_ONLY(addq(rsp, 8));
3450   popf();
3451 }
3452 
3453 // Save Integer and Float state
3454 // Warning: Stack must be 16 byte aligned (64bit)
3455 void MacroAssembler::push_CPU_state() {
3456   push_IU_state();
3457   push_FPU_state();
3458 }
3459 
3460 void MacroAssembler::push_FPU_state() {
3461   subptr(rsp, FPUStateSizeInWords * wordSize);
3462 #ifndef _LP64
3463   fnsave(Address(rsp, 0));
3464   fwait();
3465 #else
3466   fxsave(Address(rsp, 0));
3467 #endif // LP64
3468 }
3469 
3470 void MacroAssembler::push_IU_state() {
3471   // Push flags first because pusha kills them
3472   pushf();
3473   // Make sure rsp stays 16-byte aligned
3474   LP64_ONLY(subq(rsp, 8));
3475   pusha();
3476 }
3477 
3478 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
       // determine java_thread register
3479   if (!java_thread->is_valid()) {
3480     java_thread = rdi;
3481     get_thread(java_thread);
3482   }
3483   // we must set sp to zero to clear frame
3484   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3485   if (clear_fp) {
3486     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3487   }
3488 
3489   // Always clear the pc because it could have been set by make_walkable()
3490   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3491 
3492   vzeroupper();
3493 }
3494 
3495 void MacroAssembler::restore_rax(Register tmp) {
3496   if (tmp == noreg) pop(rax);
3497   else if (tmp != rax) mov(rax, tmp);
3498 }
3499 
3500 void MacroAssembler::round_to(Register reg, int modulus) {
3501   addptr(reg, modulus - 1);
3502   andptr(reg, -modulus);
3503 }
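
     // Worked example for round_to, assuming modulus is a power of two (which the
     // andptr(-modulus) mask construction requires): rounding reg == 13 up to a
     // multiple of 8 computes 13 + 7 == 20 and then 20 & -8 == 16.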
3504 
3505 void MacroAssembler::save_rax(Register tmp) {
3506   if (tmp == noreg) push(rax);
3507   else if (tmp != rax) mov(tmp, rax);
3508 }
3509 
3510 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3511   if (SafepointMechanism::uses_thread_local_poll()) {
3512 #ifdef _LP64
3513     assert(thread_reg == r15_thread, "should be");
3514 #else
3515     if (thread_reg == noreg) {
3516       thread_reg = temp_reg;
3517       get_thread(thread_reg);
3518     }
3519 #endif
3520     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3521     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3522   } else {
3523     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3524         SafepointSynchronize::_not_synchronized);
3525     jcc(Assembler::notEqual, slow_path);
3526   }
3527 }
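
     // A sketch of the two polling schemes dispatched above: with thread-local
     // polling, a safepoint or handshake sets the poll bit in the per-thread
     // polling word, so a single testb suffices; with the legacy global scheme the
     // code instead compares SafepointSynchronize's state word against
     // _not_synchronized.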
3528 
3529 // Calls to C land
3530 //
3531 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3532 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3533 // has to be reset to 0. This is required to allow proper stack traversal.
3534 void MacroAssembler::set_last_Java_frame(Register java_thread,
3535                                          Register last_java_sp,
3536                                          Register last_java_fp,
3537                                          address  last_java_pc) {
3538   vzeroupper();
3539   // determine java_thread register
3540   if (!java_thread->is_valid()) {
3541     java_thread = rdi;
3542     get_thread(java_thread);
3543   }
3544   // determine last_java_sp register
3545   if (!last_java_sp->is_valid()) {
3546     last_java_sp = rsp;
3547   }
3548 
3549   // last_java_fp is optional
3550 
3551   if (last_java_fp->is_valid()) {
3552     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3553   }
3554 
3555   // last_java_pc is optional
3556 
3557   if (last_java_pc != NULL) {
3558     lea(Address(java_thread,
3559                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3560         InternalAddress(last_java_pc));
3561 
3562   }
3563   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3564 }
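
     // A typical pairing, shown only as a sketch of the protocol described above
     // (real call sites live in the stub and wrapper generators, not here; the_pc
     // and some_c_entry_point are placeholders):
     //   set_last_Java_frame(thread, rsp, rbp, the_pc);  // entering C land: publish the anchor
     //   call(RuntimeAddress(some_c_entry_point));       // C/VM code may now walk the Java stack
     //   reset_last_Java_frame(thread, true);            // leaving C land: clear sp, fp and pc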
3565 
3566 void MacroAssembler::shlptr(Register dst, int imm8) {
3567   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3568 }
3569 
3570 void MacroAssembler::shrptr(Register dst, int imm8) {
3571   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3572 }
3573 
3574 void MacroAssembler::sign_extend_byte(Register reg) {
3575   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3576     movsbl(reg, reg); // movsxb
3577   } else {
3578     shll(reg, 24);
3579     sarl(reg, 24);
3580   }
3581 }
3582 
3583 void MacroAssembler::sign_extend_short(Register reg) {
3584   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3585     movswl(reg, reg); // movsxw
3586   } else {
3587     shll(reg, 16);
3588     sarl(reg, 16);
3589   }
3590 }
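
     // Worked example for the shll/sarl fallback used by sign_extend_short above
     // (sign_extend_byte is analogous with 24-bit shifts): with reg == 0x0000ffff,
     // i.e. a 16-bit -1, shll(reg, 16) gives 0xffff0000 and sarl(reg, 16) then
     // gives 0xffffffff, which is -1; a non-negative 16-bit value keeps a clear
     // sign bit throughout.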
3591 
3592 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3593   assert(reachable(src), "Address should be reachable");
3594   testl(dst, as_Address(src));
3595 }
3596 
3597 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3598   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3599   Assembler::pcmpeqb(dst, src);
3600 }
3601 
3602 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3603   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3604   Assembler::pcmpeqw(dst, src);
3605 }
3606 
3607 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3608   assert((dst->encoding() < 16),"XMM register should be 0-15");
3609   Assembler::pcmpestri(dst, src, imm8);
3610 }
3611 
3612 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3613   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3614   Assembler::pcmpestri(dst, src, imm8);
3615 }
3616 
3617 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3618   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3619   Assembler::pmovzxbw(dst, src);
3620 }
3621 
3622 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3623   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3624   Assembler::pmovzxbw(dst, src);
3625 }
3626 
3627 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3628   assert((src->encoding() < 16),"XMM register should be 0-15");
3629   Assembler::pmovmskb(dst, src);
3630 }
3631 
3632 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3633   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3634   Assembler::ptest(dst, src);
3635 }
3636 
3637 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3638   if (reachable(src)) {
3639     Assembler::sqrtsd(dst, as_Address(src));
3640   } else {
3641     lea(rscratch1, src);
3642     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3643   }
3644 }
3645 
3646 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3647   if (reachable(src)) {
3648     Assembler::sqrtss(dst, as_Address(src));
3649   } else {
3650     lea(rscratch1, src);
3651     Assembler::sqrtss(dst, Address(rscratch1, 0));
3652   }
3653 }
3654 
3655 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3656   if (reachable(src)) {
3657     Assembler::subsd(dst, as_Address(src));
3658   } else {
3659     lea(rscratch1, src);
3660     Assembler::subsd(dst, Address(rscratch1, 0));
3661   }
3662 }
3663 
3664 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3665   if (reachable(src)) {
3666     Assembler::subss(dst, as_Address(src));
3667   } else {
3668     lea(rscratch1, src);
3669     Assembler::subss(dst, Address(rscratch1, 0));
3670   }
3671 }
3672 
3673 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3674   if (reachable(src)) {
3675     Assembler::ucomisd(dst, as_Address(src));
3676   } else {
3677     lea(rscratch1, src);
3678     Assembler::ucomisd(dst, Address(rscratch1, 0));
3679   }
3680 }
3681 
3682 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3683   if (reachable(src)) {
3684     Assembler::ucomiss(dst, as_Address(src));
3685   } else {
3686     lea(rscratch1, src);
3687     Assembler::ucomiss(dst, Address(rscratch1, 0));
3688   }
3689 }
3690 
3691 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3692   // Used in sign-bit flipping with aligned address.
3693   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires 16-byte address alignment");
3694   if (reachable(src)) {
3695     Assembler::xorpd(dst, as_Address(src));
3696   } else {
3697     lea(scratch_reg, src);
3698     Assembler::xorpd(dst, Address(scratch_reg, 0));
3699   }
3700 }
3701 
3702 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3703   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3704     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3705   }
3706   else {
3707     Assembler::xorpd(dst, src);
3708   }
3709 }
3710 
3711 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3712   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3713     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3714   } else {
3715     Assembler::xorps(dst, src);
3716   }
3717 }
3718 
3719 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3720   // Used in sign-bit flipping with aligned address.
3721   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires 16-byte address alignment");
3722   if (reachable(src)) {
3723     Assembler::xorps(dst, as_Address(src));
3724   } else {
3725     lea(scratch_reg, src);
3726     Assembler::xorps(dst, Address(scratch_reg, 0));
3727   }
3728 }
3729 
3730 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3731   // Typically used with constant shuffle masks; in SSE mode the address must be 16-byte aligned.
3732   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3733   assert((UseAVX > 0) || aligned_adr, "SSE mode requires 16-byte address alignment");
3734   if (reachable(src)) {
3735     Assembler::pshufb(dst, as_Address(src));
3736   } else {
3737     lea(rscratch1, src);
3738     Assembler::pshufb(dst, Address(rscratch1, 0));
3739   }
3740 }
3741 
3742 // AVX 3-operands instructions
3743 
3744 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3745   if (reachable(src)) {
3746     vaddsd(dst, nds, as_Address(src));
3747   } else {
3748     lea(rscratch1, src);
3749     vaddsd(dst, nds, Address(rscratch1, 0));
3750   }
3751 }
3752 
3753 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3754   if (reachable(src)) {
3755     vaddss(dst, nds, as_Address(src));
3756   } else {
3757     lea(rscratch1, src);
3758     vaddss(dst, nds, Address(rscratch1, 0));
3759   }
3760 }
3761 
3762 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3763   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3764   vandps(dst, nds, negate_field, vector_len);
3765 }
3766 
3767 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3768   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3769   vandpd(dst, nds, negate_field, vector_len);
3770 }
3771 
3772 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3773   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3774   Assembler::vpaddb(dst, nds, src, vector_len);
3775 }
3776 
3777 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3778   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3779   Assembler::vpaddb(dst, nds, src, vector_len);
3780 }
3781 
3782 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3783   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3784   Assembler::vpaddw(dst, nds, src, vector_len);
3785 }
3786 
3787 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3788   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3789   Assembler::vpaddw(dst, nds, src, vector_len);
3790 }
3791 
3792 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3793   if (reachable(src)) {
3794     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3795   } else {
3796     lea(scratch_reg, src);
3797     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3798   }
3799 }
3800 
3801 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3802   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3803   Assembler::vpbroadcastw(dst, src, vector_len);
3804 }
3805 
3806 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3807   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3808   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3809 }
3810 
3811 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3812   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3813   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3814 }
3815 
3816 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3817   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3818   Assembler::vpmovzxbw(dst, src, vector_len);
3819 }
3820 
3821 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3822   assert((src->encoding() < 16),"XMM register should be 0-15");
3823   Assembler::vpmovmskb(dst, src);
3824 }
3825 
3826 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3827   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3828   Assembler::vpmullw(dst, nds, src, vector_len);
3829 }
3830 
3831 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3832   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3833   Assembler::vpmullw(dst, nds, src, vector_len);
3834 }
3835 
3836 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3837   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3838   Assembler::vpsubb(dst, nds, src, vector_len);
3839 }
3840 
3841 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3842   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3843   Assembler::vpsubb(dst, nds, src, vector_len);
3844 }
3845 
3846 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3847   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3848   Assembler::vpsubw(dst, nds, src, vector_len);
3849 }
3850 
3851 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3852   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3853   Assembler::vpsubw(dst, nds, src, vector_len);
3854 }
3855 
3856 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3857   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3858   Assembler::vpsraw(dst, nds, shift, vector_len);
3859 }
3860 
3861 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3862   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3863   Assembler::vpsraw(dst, nds, shift, vector_len);
3864 }
3865 
3866 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3867   assert(UseAVX > 2,"");
3868   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3869      vector_len = 2;
3870   }
3871   Assembler::evpsraq(dst, nds, shift, vector_len);
3872 }
3873 
3874 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3875   assert(UseAVX > 2,"");
3876   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3877      vector_len = 2;
3878   }
3879   Assembler::evpsraq(dst, nds, shift, vector_len);
3880 }
3881 
3882 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3883   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3884   Assembler::vpsrlw(dst, nds, shift, vector_len);
3885 }
3886 
3887 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3888   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3889   Assembler::vpsrlw(dst, nds, shift, vector_len);
3890 }
3891 
3892 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3893   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3894   Assembler::vpsllw(dst, nds, shift, vector_len);
3895 }
3896 
3897 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3898   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3899   Assembler::vpsllw(dst, nds, shift, vector_len);
3900 }
3901 
3902 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3903   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3904   Assembler::vptest(dst, src);
3905 }
3906 
3907 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3908   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3909   Assembler::punpcklbw(dst, src);
3910 }
3911 
3912 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3913   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3914   Assembler::pshufd(dst, src, mode);
3915 }
3916 
3917 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3918   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3919   Assembler::pshuflw(dst, src, mode);
3920 }
3921 
3922 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3923   if (reachable(src)) {
3924     vandpd(dst, nds, as_Address(src), vector_len);
3925   } else {
3926     lea(scratch_reg, src);
3927     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3928   }
3929 }
3930 
3931 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3932   if (reachable(src)) {
3933     vandps(dst, nds, as_Address(src), vector_len);
3934   } else {
3935     lea(scratch_reg, src);
3936     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3937   }
3938 }
3939 
3940 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3941   if (reachable(src)) {
3942     vdivsd(dst, nds, as_Address(src));
3943   } else {
3944     lea(rscratch1, src);
3945     vdivsd(dst, nds, Address(rscratch1, 0));
3946   }
3947 }
3948 
3949 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3950   if (reachable(src)) {
3951     vdivss(dst, nds, as_Address(src));
3952   } else {
3953     lea(rscratch1, src);
3954     vdivss(dst, nds, Address(rscratch1, 0));
3955   }
3956 }
3957 
3958 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3959   if (reachable(src)) {
3960     vmulsd(dst, nds, as_Address(src));
3961   } else {
3962     lea(rscratch1, src);
3963     vmulsd(dst, nds, Address(rscratch1, 0));
3964   }
3965 }
3966 
3967 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3968   if (reachable(src)) {
3969     vmulss(dst, nds, as_Address(src));
3970   } else {
3971     lea(rscratch1, src);
3972     vmulss(dst, nds, Address(rscratch1, 0));
3973   }
3974 }
3975 
3976 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3977   if (reachable(src)) {
3978     vsubsd(dst, nds, as_Address(src));
3979   } else {
3980     lea(rscratch1, src);
3981     vsubsd(dst, nds, Address(rscratch1, 0));
3982   }
3983 }
3984 
3985 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3986   if (reachable(src)) {
3987     vsubss(dst, nds, as_Address(src));
3988   } else {
3989     lea(rscratch1, src);
3990     vsubss(dst, nds, Address(rscratch1, 0));
3991   }
3992 }
3993 
3994 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3995   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3996   vxorps(dst, nds, src, Assembler::AVX_128bit);
3997 }
3998 
3999 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4000   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4001   vxorpd(dst, nds, src, Assembler::AVX_128bit);
4002 }
4003 
4004 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4005   if (reachable(src)) {
4006     vxorpd(dst, nds, as_Address(src), vector_len);
4007   } else {
4008     lea(scratch_reg, src);
4009     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
4010   }
4011 }
4012 
4013 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4014   if (reachable(src)) {
4015     vxorps(dst, nds, as_Address(src), vector_len);
4016   } else {
4017     lea(scratch_reg, src);
4018     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
4019   }
4020 }
4021 
4022 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4023   if (UseAVX > 1 || (vector_len < 1)) {
4024     if (reachable(src)) {
4025       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
4026     } else {
4027       lea(scratch_reg, src);
4028       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
4029     }
4030   }
4031   else {
4032     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
4033   }
4034 }
4035 
4036 //-------------------------------------------------------------------------------------------
4037 #ifdef COMPILER2
4038 // Generic instructions support for use in .ad files C2 code generation
4039 
4040 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) {
4041   if (opcode == Op_AbsVD) {
4042     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
4043   } else {
4044     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4045     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
4046   }
4047 }
4048 
4049 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4050   if (opcode == Op_AbsVD) {
4051     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
4052   } else {
4053     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4054     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
4055   }
4056 }
4057 
4058 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) {
4059   if (opcode == Op_AbsVF) {
4060     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
4061   } else {
4062     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4063     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
4064   }
4065 }
4066 
4067 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4068   if (opcode == Op_AbsVF) {
4069     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
4070   } else {
4071     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4072     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
4073   }
4074 }
4075 
4076 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
4077   if (sign) {
4078     pmovsxbw(dst, src);
4079   } else {
4080     pmovzxbw(dst, src);
4081   }
4082 }
4083 
4084 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
4085   if (sign) {
4086     vpmovsxbw(dst, src, vector_len);
4087   } else {
4088     vpmovzxbw(dst, src, vector_len);
4089   }
4090 }
4091 
4092 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
4093   if (opcode == Op_RShiftVI) {
4094     psrad(dst, src);
4095   } else if (opcode == Op_LShiftVI) {
4096     pslld(dst, src);
4097   } else {
4098     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4099     psrld(dst, src);
4100   }
4101 }
4102 
4103 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4104   if (opcode == Op_RShiftVI) {
4105     vpsrad(dst, nds, src, vector_len);
4106   } else if (opcode == Op_LShiftVI) {
4107     vpslld(dst, nds, src, vector_len);
4108   } else {
4109     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4110     vpsrld(dst, nds, src, vector_len);
4111   }
4112 }
4113 
4114 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
4115   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4116     psraw(dst, src);
4117   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4118     psllw(dst, src);
4119   } else {
4120     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4121     psrlw(dst, src);
4122   }
4123 }
4124 
4125 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4126   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4127     vpsraw(dst, nds, src, vector_len);
4128   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4129     vpsllw(dst, nds, src, vector_len);
4130   } else {
4131     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4132     vpsrlw(dst, nds, src, vector_len);
4133   }
4134 }
4135 
4136 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
4137   if (opcode == Op_RShiftVL) {
4138     psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
4139   } else if (opcode == Op_LShiftVL) {
4140     psllq(dst, src);
4141   } else {
4142     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4143     psrlq(dst, src);
4144   }
4145 }
4146 
4147 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4148   if (opcode == Op_RShiftVL) {
4149     evpsraq(dst, nds, src, vector_len);
4150   } else if (opcode == Op_LShiftVL) {
4151     vpsllq(dst, nds, src, vector_len);
4152   } else {
4153     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4154     vpsrlq(dst, nds, src, vector_len);
4155   }
4156 }
4157 #endif
4158 //-------------------------------------------------------------------------------------------
4159 
4160 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4161   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4162   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4163   // The inverted mask is sign-extended
4164   andptr(possibly_jweak, inverted_jweak_mask);
4165 }
4166 
4167 void MacroAssembler::resolve_jobject(Register value,
4168                                      Register thread,
4169                                      Register tmp) {
4170   assert_different_registers(value, thread, tmp);
4171   Label done, not_weak;
4172   testptr(value, value);
4173   jcc(Assembler::zero, done);                // Use NULL as-is.
4174   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4175   jcc(Assembler::zero, not_weak);
4176   // Resolve jweak.
4177   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4178                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4179   verify_oop(value);
4180   jmp(done);
4181   bind(not_weak);
4182   // Resolve (untagged) jobject.
4183   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4184   verify_oop(value);
4185   bind(done);
4186 }
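
     // The handle layout assumed by the tag test above (a sketch; the authoritative
     // constants are JNIHandles::weak_tag_mask and JNIHandles::weak_tag_value):
     //   jobject handle:  ptr | 0   -> oop loaded from (ptr + 0)
     //   jweak handle:    ptr | 1   -> oop loaded from (ptr - 1) with a phantom-ref barrier
     // clear_jweak_tag() above simply strips that low bit again with andptr(~1).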
4187 
4188 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4189   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4190 }
4191 
4192 // Force generation of a 4-byte immediate value even if it fits into 8 bits
4193 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4194   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4195 }
4196 
4197 void MacroAssembler::subptr(Register dst, Register src) {
4198   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4199 }
4200 
4201 // C++ bool manipulation
4202 void MacroAssembler::testbool(Register dst) {
4203   if(sizeof(bool) == 1)
4204     testb(dst, 0xff);
4205   else if(sizeof(bool) == 2) {
4206     // testw implementation needed for two byte bools
4207     ShouldNotReachHere();
4208   } else if(sizeof(bool) == 4)
4209     testl(dst, dst);
4210   else
4211     // unsupported
4212     ShouldNotReachHere();
4213 }
4214 
4215 void MacroAssembler::testptr(Register dst, Register src) {
4216   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4217 }
4218 
4219 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4220 void MacroAssembler::tlab_allocate(Register thread, Register obj,
4221                                    Register var_size_in_bytes,
4222                                    int con_size_in_bytes,
4223                                    Register t1,
4224                                    Register t2,
4225                                    Label& slow_case) {
4226   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4227   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4228 }
4229 
4230 // Defines obj, preserves var_size_in_bytes
4231 void MacroAssembler::eden_allocate(Register thread, Register obj,
4232                                    Register var_size_in_bytes,
4233                                    int con_size_in_bytes,
4234                                    Register t1,
4235                                    Label& slow_case) {
4236   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4237   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4238 }
4239 
4240 // Preserves the contents of address, destroys the contents of length_in_bytes and temp.
4241 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4242   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4243   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4244   Label done;
4245 
4246   testptr(length_in_bytes, length_in_bytes);
4247   jcc(Assembler::zero, done);
4248 
4249   // initialize topmost word, divide index by 2, check if odd and test if zero
4250   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4251 #ifdef ASSERT
4252   {
4253     Label L;
4254     testptr(length_in_bytes, BytesPerWord - 1);
4255     jcc(Assembler::zero, L);
4256     stop("length must be a multiple of BytesPerWord");
4257     bind(L);
4258   }
4259 #endif
4260   Register index = length_in_bytes;
4261   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4262   if (UseIncDec) {
4263     shrptr(index, 3);  // divide by 8 and set carry flag if bit 2 was set
4264   } else {
4265     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4266     shrptr(index, 1);
4267   }
4268 #ifndef _LP64
4269   // index could have not been a multiple of 8 (i.e., bit 2 was set)
4270   {
4271     Label even;
4272     // note: if index was a multiple of 8, then it cannot
4273     //       be 0 now otherwise it must have been 0 before
4274     //       => if it is even, we don't need to check for 0 again
4275     jcc(Assembler::carryClear, even);
4276     // clear topmost word (no jump would be needed if conditional assignment worked here)
4277     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4278     // index could be 0 now, must check again
4279     jcc(Assembler::zero, done);
4280     bind(even);
4281   }
4282 #endif // !_LP64
4283   // initialize remaining object fields: index is a multiple of 2 now
4284   {
4285     Label loop;
4286     bind(loop);
4287     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4288     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4289     decrement(index);
4290     jcc(Assembler::notZero, loop);
4291   }
4292 
4293   bind(done);
4294 }
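
     // Roughly the C equivalent of the code generated above, as a sketch assuming a
     // 64-bit build where each loop iteration clears one 8-byte word:
     //
     //   void zero_memory_sketch(char* address, size_t length_in_bytes, size_t offset) {
     //     size_t index = length_in_bytes >> 3;            // number of 8-byte chunks
     //     intptr_t* base = (intptr_t*) (address + offset);
     //     while (index != 0) {
     //       base[index - 1] = 0;                          // clear from the top down
     //       index--;
     //     }
     //   }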
4295 
4296 // Look up the method for a megamorphic invokeinterface call.
4297 // The target method is determined by <intf_klass, itable_index>.
4298 // The receiver klass is in recv_klass.
4299 // On success, the result will be in method_result, and execution falls through.
4300 // On failure, execution transfers to the given label.
4301 void MacroAssembler::lookup_interface_method(Register recv_klass,
4302                                              Register intf_klass,
4303                                              RegisterOrConstant itable_index,
4304                                              Register method_result,
4305                                              Register scan_temp,
4306                                              Label& L_no_such_interface,
4307                                              bool return_method) {
4308   assert_different_registers(recv_klass, intf_klass, scan_temp);
4309   assert_different_registers(method_result, intf_klass, scan_temp);
4310   assert(recv_klass != method_result || !return_method,
4311          "recv_klass can be destroyed when method isn't needed");
4312 
4313   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4314          "caller must use same register for non-constant itable index as for method");
4315 
4316   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4317   int vtable_base = in_bytes(Klass::vtable_start_offset());
4318   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4319   int scan_step   = itableOffsetEntry::size() * wordSize;
4320   int vte_size    = vtableEntry::size_in_bytes();
4321   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4322   assert(vte_size == wordSize, "else adjust times_vte_scale");
4323 
4324   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4325 
4326   // %%% Could store the aligned, prescaled offset in the klassoop.
4327   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4328 
4329   if (return_method) {
4330     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4331     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4332     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4333   }
4334 
4335   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4336   //   if (scan->interface() == intf) {
4337   //     result = (klass + scan->offset() + itable_index);
4338   //   }
4339   // }
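       // Itable layout assumed by this scan (illustrative sketch):
       //   [ vtable: vtable_length entries                                        ]
       //   [ itableOffsetEntry { interface, offset } ..., NULL-interface sentinel ]
       //   [ itableMethodEntry blocks, reached from the matching entry's 'offset' ]
       // On a hit: method_result = *(recv_klass + offset + itable_index * wordSize
       //                             + itableMethodEntry::method_offset_in_bytes()).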
4340   Label search, found_method;
4341 
4342   for (int peel = 1; peel >= 0; peel--) {
4343     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4344     cmpptr(intf_klass, method_result);
4345 
4346     if (peel) {
4347       jccb(Assembler::equal, found_method);
4348     } else {
4349       jccb(Assembler::notEqual, search);
4350       // (invert the test to fall through to found_method...)
4351     }
4352 
4353     if (!peel)  break;
4354 
4355     bind(search);
4356 
4357     // Check that the previous entry is non-null.  A null entry means that
4358     // the receiver class doesn't implement the interface, and wasn't the
4359     // same as when the caller was compiled.
4360     testptr(method_result, method_result);
4361     jcc(Assembler::zero, L_no_such_interface);
4362     addptr(scan_temp, scan_step);
4363   }
4364 
4365   bind(found_method);
4366 
4367   if (return_method) {
4368     // Got a hit.
4369     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4370     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4371   }
4372 }
4373 
4374 
4375 // virtual method calling
4376 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4377                                            RegisterOrConstant vtable_index,
4378                                            Register method_result) {
4379   const int base = in_bytes(Klass::vtable_start_offset());
4380   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4381   Address vtable_entry_addr(recv_klass,
4382                             vtable_index, Address::times_ptr,
4383                             base + vtableEntry::method_offset_in_bytes());
4384   movptr(method_result, vtable_entry_addr);
4385 }
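
     // Equivalently (illustrative sketch):
     //   method_result = *(recv_klass + Klass::vtable_start_offset()
     //                     + vtable_index * wordSize + vtableEntry::method_offset_in_bytes());
     // i.e. a single indexed load from the receiver klass's embedded vtable.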
4386 
4387 
4388 void MacroAssembler::check_klass_subtype(Register sub_klass,
4389                            Register super_klass,
4390                            Register temp_reg,
4391                            Label& L_success) {
4392   Label L_failure;
4393   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4394   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4395   bind(L_failure);
4396 }
4397 
4398 
4399 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4400                                                    Register super_klass,
4401                                                    Register temp_reg,
4402                                                    Label* L_success,
4403                                                    Label* L_failure,
4404                                                    Label* L_slow_path,
4405                                         RegisterOrConstant super_check_offset) {
4406   assert_different_registers(sub_klass, super_klass, temp_reg);
4407   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4408   if (super_check_offset.is_register()) {
4409     assert_different_registers(sub_klass, super_klass,
4410                                super_check_offset.as_register());
4411   } else if (must_load_sco) {
4412     assert(temp_reg != noreg, "supply either a temp or a register offset");
4413   }
4414 
4415   Label L_fallthrough;
4416   int label_nulls = 0;
4417   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4418   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4419   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4420   assert(label_nulls <= 1, "at most one NULL in the batch");
4421 
4422   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4423   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4424   Address super_check_offset_addr(super_klass, sco_offset);
4425 
4426   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4427   // range of a jccb.  If this routine grows larger, reconsider at
4428   // least some of these.
4429 #define local_jcc(assembler_cond, label)                                \
4430   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4431   else                             jcc( assembler_cond, label) /*omit semi*/
4432 
4433   // Hacked jmp, which may only be used just before L_fallthrough.
4434 #define final_jmp(label)                                                \
4435   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4436   else                            jmp(label)                /*omit semi*/
4437 
4438   // If the pointers are equal, we are done (e.g., String[] elements).
4439   // This self-check enables sharing of secondary supertype arrays among
4440   // non-primary types such as array-of-interface.  Otherwise, each such
4441   // type would need its own customized SSA.
4442   // We move this check to the front of the fast path because many
4443   // type checks are in fact trivially successful in this manner,
4444   // so we get a nicely predicted branch right at the start of the check.
4445   cmpptr(sub_klass, super_klass);
4446   local_jcc(Assembler::equal, *L_success);
4447 
4448   // Check the supertype display:
4449   if (must_load_sco) {
4450     // Positive movl does the right thing on LP64.
4451     movl(temp_reg, super_check_offset_addr);
4452     super_check_offset = RegisterOrConstant(temp_reg);
4453   }
4454   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4455   cmpptr(super_klass, super_check_addr); // load displayed supertype
4456 
4457   // This check has worked decisively for primary supers.
4458   // Secondary supers are sought in the super_cache ('super_cache_addr').
4459   // (Secondary supers are interfaces and very deeply nested subtypes.)
4460   // This works in the same check above because of a tricky aliasing
4461   // between the super_cache and the primary super display elements.
4462   // (The 'super_check_addr' can address either, as the case requires.)
4463   // Note that the cache is updated below if it does not help us find
4464   // what we need immediately.
4465   // So if it was a primary super, we can just fail immediately.
4466   // Otherwise, it's the slow path for us (no success at this point).
4467 
4468   if (super_check_offset.is_register()) {
4469     local_jcc(Assembler::equal, *L_success);
4470     cmpl(super_check_offset.as_register(), sc_offset);
4471     if (L_failure == &L_fallthrough) {
4472       local_jcc(Assembler::equal, *L_slow_path);
4473     } else {
4474       local_jcc(Assembler::notEqual, *L_failure);
4475       final_jmp(*L_slow_path);
4476     }
4477   } else if (super_check_offset.as_constant() == sc_offset) {
4478     // Need a slow path; fast failure is impossible.
4479     if (L_slow_path == &L_fallthrough) {
4480       local_jcc(Assembler::equal, *L_success);
4481     } else {
4482       local_jcc(Assembler::notEqual, *L_slow_path);
4483       final_jmp(*L_success);
4484     }
4485   } else {
4486     // No slow path; it's a fast decision.
4487     if (L_failure == &L_fallthrough) {
4488       local_jcc(Assembler::equal, *L_success);
4489     } else {
4490       local_jcc(Assembler::notEqual, *L_failure);
4491       final_jmp(*L_success);
4492     }
4493   }
4494 
4495   bind(L_fallthrough);
4496 
4497 #undef local_jcc
4498 #undef final_jmp
4499 }
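
     // Rough decision table for the fast path above (sketch only; label plumbing omitted):
     //   if (sub_klass == super_klass)                    -> L_success
     //   else if (*(sub_klass + sco) == super_klass)      -> L_success    // display or cache hit
     //   else if (sco == secondary_super_cache_offset)    -> L_slow_path  // cache miss, must scan
     //   else                                             -> L_failure    // primary display miss is decisive
     // where sco is the super_check_offset loaded from super_klass.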
4500 
4501 
4502 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4503                                                    Register super_klass,
4504                                                    Register temp_reg,
4505                                                    Register temp2_reg,
4506                                                    Label* L_success,
4507                                                    Label* L_failure,
4508                                                    bool set_cond_codes) {
4509   assert_different_registers(sub_klass, super_klass, temp_reg);
4510   if (temp2_reg != noreg)
4511     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4512 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4513 
4514   Label L_fallthrough;
4515   int label_nulls = 0;
4516   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4517   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4518   assert(label_nulls <= 1, "at most one NULL in the batch");
4519 
4520   // a couple of useful fields in sub_klass:
4521   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4522   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4523   Address secondary_supers_addr(sub_klass, ss_offset);
4524   Address super_cache_addr(     sub_klass, sc_offset);
4525 
4526   // Do a linear scan of the secondary super-klass chain.
4527   // This code is rarely used, so simplicity is a virtue here.
4528   // The repne_scan instruction uses fixed registers, which we must spill.
4529   // Don't worry too much about pre-existing connections with the input regs.
4530 
4531   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4532   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4533 
4534   // Get super_klass value into rax (even if it was in rdi or rcx).
4535   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4536   if (super_klass != rax || UseCompressedOops) {
4537     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4538     mov(rax, super_klass);
4539   }
4540   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4541   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4542 
4543 #ifndef PRODUCT
4544   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4545   ExternalAddress pst_counter_addr((address) pst_counter);
4546   NOT_LP64(  incrementl(pst_counter_addr) );
4547   LP64_ONLY( lea(rcx, pst_counter_addr) );
4548   LP64_ONLY( incrementl(Address(rcx, 0)) );
4549 #endif //PRODUCT
4550 
4551   // We will consult the secondary-super array.
4552   movptr(rdi, secondary_supers_addr);
4553   // Load the array length.  (Positive movl does the right thing on LP64.)
4554   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4555   // Skip to start of data.
4556   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4557 
4558   // Scan RCX words at [RDI] for an occurrence of RAX.
4559   // Set NZ/Z based on last compare.
4560   // The 'repne' prefix will not set the Z flag if RCX == 0, since only the
4561   // repeated scas instruction changes flags.
4562   // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
4563 
4564   testptr(rax, rax); // Set Z = 0
4565   repne_scan();
4566 
4567   // Unspill the temp. registers:
4568   if (pushed_rdi)  pop(rdi);
4569   if (pushed_rcx)  pop(rcx);
4570   if (pushed_rax)  pop(rax);
4571 
4572   if (set_cond_codes) {
4573     // Special hack for the AD files:  rdi is guaranteed non-zero.
4574     assert(!pushed_rdi, "rdi must be left non-NULL");
4575     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4576   }
4577 
4578   if (L_failure == &L_fallthrough)
4579         jccb(Assembler::notEqual, *L_failure);
4580   else  jcc(Assembler::notEqual, *L_failure);
4581 
4582   // Success.  Cache the super we found and proceed in triumph.
4583   movptr(super_cache_addr, super_klass);
4584 
4585   if (L_success != &L_fallthrough) {
4586     jmp(*L_success);
4587   }
4588 
4589 #undef IS_A_TEMP
4590 
4591   bind(L_fallthrough);
4592 }
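
     // What the repne_scan above computes, roughly (illustrative sketch):
     //   Array<Klass*>* ss = *secondary_supers_addr;
     //   for (int i = 0; i < ss->length(); i++) {
     //     if (ss->at(i) == super_klass) { *super_cache_addr = super_klass; goto L_success; }
     //   }
     //   goto L_failure;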
4593 
4594 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4595   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4596 
4597   Label L_fallthrough;
4598   if (L_fast_path == NULL) {
4599     L_fast_path = &L_fallthrough;
4600   } else if (L_slow_path == NULL) {
4601     L_slow_path = &L_fallthrough;
4602   }
4603 
4604   // Fast path check: class is fully initialized
4605   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4606   jcc(Assembler::equal, *L_fast_path);
4607 
4608   // Fast path check: current thread is initializer thread
4609   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4610   if (L_slow_path == &L_fallthrough) {
4611     jcc(Assembler::equal, *L_fast_path);
4612     bind(*L_slow_path);
4613   } else if (L_fast_path == &L_fallthrough) {
4614     jcc(Assembler::notEqual, *L_slow_path);
4615     bind(*L_fast_path);
4616   } else {
4617     Unimplemented();
4618   }
4619 }
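
     // The two fast-path checks above amount to (sketch):
     //   if (klass->init_state == fully_initialized || klass->init_thread == current thread)
     //     -> fast path
     //   else
     //     -> slow path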
4620 
4621 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4622   if (VM_Version::supports_cmov()) {
4623     cmovl(cc, dst, src);
4624   } else {
4625     Label L;
4626     jccb(negate_condition(cc), L);
4627     movl(dst, src);
4628     bind(L);
4629   }
4630 }
4631 
4632 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4633   if (VM_Version::supports_cmov()) {
4634     cmovl(cc, dst, src);
4635   } else {
4636     Label L;
4637     jccb(negate_condition(cc), L);
4638     movl(dst, src);
4639     bind(L);
4640   }
4641 }
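
     // On CPUs without cmov, both overloads above emulate it by branching over the move (sketch):
     //   if (!cc) goto L;  dst = src;  L: ;
     // so the move happens exactly when the condition holds, matching cmovl.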
4642 
4643 void MacroAssembler::verify_oop(Register reg, const char* s) {
4644   if (!VerifyOops) return;
4645 
4646   // Pass register number to verify_oop_subroutine
4647   const char* b = NULL;
4648   {
4649     ResourceMark rm;
4650     stringStream ss;
4651     ss.print("verify_oop: %s: %s", reg->name(), s);
4652     b = code_string(ss.as_string());
4653   }
4654   BLOCK_COMMENT("verify_oop {");
4655 #ifdef _LP64
4656   push(rscratch1);                    // save r10, trashed by movptr()
4657 #endif
4658   push(rax);                          // save rax
4659   push(reg);                          // pass register argument
4660   ExternalAddress buffer((address) b);
4661   // avoid using pushptr, as it modifies scratch registers
4662   // and our contract is not to modify anything
4663   movptr(rax, buffer.addr());
4664   push(rax);
4665   // call indirectly to solve generation ordering problem
4666   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4667   call(rax);
4668   // Caller pops the arguments (oop, message) and restores rax, r10
4669   BLOCK_COMMENT("} verify_oop");
4670 }
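
     // Stack handed to the verify_oop subroutine above, top of stack first (sketch):
     //   [ return address ]   pushed by call(rax)
     //   [ message char*  ]
     //   [ oop to verify  ]
     //   [ saved rax      ]
     //   [ saved r10      ]   LP64 only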
4671 
4672 
4673 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
4674                                                       Register tmp,
4675                                                       int offset) {
4676   intptr_t value = *delayed_value_addr;
4677   if (value != 0)
4678     return RegisterOrConstant(value + offset);
4679 
4680   // load indirectly to solve generation ordering problem
4681   movptr(tmp, ExternalAddress((address) delayed_value_addr));
4682 
4683 #ifdef ASSERT
4684   { Label L;
4685     testptr(tmp, tmp);
4686     if (WizardMode) {
4687       const char* buf = NULL;
4688       {
4689         ResourceMark rm;
4690         stringStream ss;
4691         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
4692         buf = code_string(ss.as_string());
4693       }
4694       jcc(Assembler::notZero, L);
4695       STOP(buf);
4696     } else {
4697       jccb(Assembler::notZero, L);
4698       hlt();
4699     }
4700     bind(L);
4701   }
4702 #endif
4703 
4704   if (offset != 0)
4705     addptr(tmp, offset);
4706 
4707   return RegisterOrConstant(tmp);
4708 }
4709 
4710 
4711 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4712                                          int extra_slot_offset) {
4713   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4714   int stackElementSize = Interpreter::stackElementSize;
4715   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4716 #ifdef ASSERT
4717   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4718   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4719 #endif
4720   Register             scale_reg    = noreg;
4721   Address::ScaleFactor scale_factor = Address::no_scale;
4722   if (arg_slot.is_constant()) {
4723     offset += arg_slot.as_constant() * stackElementSize;
4724   } else {
4725     scale_reg    = arg_slot.as_register();
4726     scale_factor = Address::times(stackElementSize);
4727   }
4728   offset += wordSize;           // return PC is on stack
4729   return Address(rsp, scale_reg, scale_factor, offset);
4730 }
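
     // Example (sketch, assuming Interpreter::expr_offset_in_bytes(i) == i * stackElementSize):
     //   argument_address(2) == Address(rsp, 2 * Interpreter::stackElementSize + wordSize)
     // i.e. expression stack slot 2, skipping the return PC that is still on the stack.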
4731 
4732 
4733 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
4734   if (!VerifyOops) return;
4735 
4736   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4737   // Pass register number to verify_oop_subroutine
4738   const char* b = NULL;
4739   {
4740     ResourceMark rm;
4741     stringStream ss;
4742     ss.print("verify_oop_addr: %s", s);
4743     b = code_string(ss.as_string());
4744   }
4745 #ifdef _LP64
4746   push(rscratch1);                    // save r10, trashed by movptr()
4747 #endif
4748   push(rax);                          // save rax
4749   // addr may contain rsp so we will have to adjust it based on the push
4750   // we just did (and on 64 bit we do two pushes)
4751   // NOTE: the 64-bit code appears to have had a bug here: it did movq(addr, rax),
4752   // which stores rax into addr, the reverse of what was intended.
4753   if (addr.uses(rsp)) {
4754     lea(rax, addr);
4755     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4756   } else {
4757     pushptr(addr);
4758   }
4759 
4760   ExternalAddress buffer((address) b);
4761   // pass msg argument
4762   // avoid using pushptr, as it modifies scratch registers
4763   // and our contract is not to modify anything
4764   movptr(rax, buffer.addr());
4765   push(rax);
4766 
4767   // call indirectly to solve generation ordering problem
4768   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4769   call(rax);
4770   // Caller pops the arguments (addr, message) and restores rax, r10.
4771 }
4772 
4773 void MacroAssembler::verify_tlab() {
4774 #ifdef ASSERT
4775   if (UseTLAB && VerifyOops) {
4776     Label next, ok;
4777     Register t1 = rsi;
4778     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4779 
4780     push(t1);
4781     NOT_LP64(push(thread_reg));
4782     NOT_LP64(get_thread(thread_reg));
4783 
4784     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4785     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4786     jcc(Assembler::aboveEqual, next);
4787     STOP("assert(top >= start)");
4788     should_not_reach_here();
4789 
4790     bind(next);
4791     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4792     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4793     jcc(Assembler::aboveEqual, ok);
4794     STOP("assert(top <= end)");
4795     should_not_reach_here();
4796 
4797     bind(ok);
4798     NOT_LP64(pop(thread_reg));
4799     pop(t1);
4800   }
4801 #endif
4802 }
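
     // Debug-only sanity check; equivalent to (sketch):
     //   assert(tlab_start <= tlab_top && tlab_top <= tlab_end)
     // using the per-thread TLAB fields loaded above.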
4803 
4804 class ControlWord {
4805  public:
4806   int32_t _value;
4807 
4808   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4809   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4810   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4811   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4812   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4813   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4814   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4815   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4816 
4817   void print() const {
4818     // rounding control
4819     const char* rc;
4820     switch (rounding_control()) {
4821       case 0: rc = "round near"; break;
4822       case 1: rc = "round down"; break;
4823       case 2: rc = "round up  "; break;
4824       case 3: rc = "chop      "; break;
4825     };
4826     // precision control
4827     const char* pc;
4828     switch (precision_control()) {
4829       case 0: pc = "24 bits "; break;
4830       case 1: pc = "reserved"; break;
4831       case 2: pc = "53 bits "; break;
4832       case 3: pc = "64 bits "; break;
4833     };
4834     // flags
4835     char f[9];
4836     f[0] = ' ';
4837     f[1] = ' ';
4838     f[2] = (precision   ()) ? 'P' : 'p';
4839     f[3] = (underflow   ()) ? 'U' : 'u';
4840     f[4] = (overflow    ()) ? 'O' : 'o';
4841     f[5] = (zero_divide ()) ? 'Z' : 'z';
4842     f[6] = (denormalized()) ? 'D' : 'd';
4843     f[7] = (invalid     ()) ? 'I' : 'i';
4844     f[8] = '\x0';
4845     // output
4846     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4847   }
4848 
4849 };
4850 
4851 class StatusWord {
4852  public:
4853   int32_t _value;
4854 
4855   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4856   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4857   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4858   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4859   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4860   int  top() const                     { return  (_value >> 11) & 7      ; }
4861   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4862   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4863   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4864   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4865   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4866   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4867   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4868   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4869 
4870   void print() const {
4871     // condition codes
4872     char c[5];
4873     c[0] = (C3()) ? '3' : '-';
4874     c[1] = (C2()) ? '2' : '-';
4875     c[2] = (C1()) ? '1' : '-';
4876     c[3] = (C0()) ? '0' : '-';
4877     c[4] = '\x0';
4878     // flags
4879     char f[9];
4880     f[0] = (error_status()) ? 'E' : '-';
4881     f[1] = (stack_fault ()) ? 'S' : '-';
4882     f[2] = (precision   ()) ? 'P' : '-';
4883     f[3] = (underflow   ()) ? 'U' : '-';
4884     f[4] = (overflow    ()) ? 'O' : '-';
4885     f[5] = (zero_divide ()) ? 'Z' : '-';
4886     f[6] = (denormalized()) ? 'D' : '-';
4887     f[7] = (invalid     ()) ? 'I' : '-';
4888     f[8] = '\x0';
4889     // output
4890     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4891   }
4892 
4893 };
4894 
4895 class TagWord {
4896  public:
4897   int32_t _value;
4898 
4899   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4900 
4901   void print() const {
4902     printf("%04x", _value & 0xFFFF);
4903   }
4904 
4905 };
4906 
4907 class FPU_Register {
4908  public:
4909   int32_t _m0;
4910   int32_t _m1;
4911   int16_t _ex;
4912 
4913   bool is_indefinite() const           {
4914     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4915   }
4916 
4917   void print() const {
4918     char  sign = (_ex < 0) ? '-' : '+';
4919     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4920     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4921   };
4922 
4923 };
4924 
4925 class FPU_State {
4926  public:
4927   enum {
4928     register_size       = 10,
4929     number_of_registers =  8,
4930     register_mask       =  7
4931   };
4932 
4933   ControlWord  _control_word;
4934   StatusWord   _status_word;
4935   TagWord      _tag_word;
4936   int32_t      _error_offset;
4937   int32_t      _error_selector;
4938   int32_t      _data_offset;
4939   int32_t      _data_selector;
4940   int8_t       _register[register_size * number_of_registers];
4941 
4942   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4943   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4944 
4945   const char* tag_as_string(int tag) const {
4946     switch (tag) {
4947       case 0: return "valid";
4948       case 1: return "zero";
4949       case 2: return "special";
4950       case 3: return "empty";
4951     }
4952     ShouldNotReachHere();
4953     return NULL;
4954   }
4955 
4956   void print() const {
4957     // print computation registers
4958     { int t = _status_word.top();
4959       for (int i = 0; i < number_of_registers; i++) {
4960         int j = (i - t) & register_mask;
4961         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4962         st(j)->print();
4963         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4964       }
4965     }
4966     printf("\n");
4967     // print control registers
4968     printf("ctrl = "); _control_word.print(); printf("\n");
4969     printf("stat = "); _status_word .print(); printf("\n");
4970     printf("tags = "); _tag_word    .print(); printf("\n");
4971   }
4972 
4973 };
4974 
4975 class Flag_Register {
4976  public:
4977   int32_t _value;
4978 
4979   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4980   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4981   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4982   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4983   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4984   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4985   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4986 
4987   void print() const {
4988     // flags
4989     char f[8];
4990     f[0] = (overflow       ()) ? 'O' : '-';
4991     f[1] = (direction      ()) ? 'D' : '-';
4992     f[2] = (sign           ()) ? 'S' : '-';
4993     f[3] = (zero           ()) ? 'Z' : '-';
4994     f[4] = (auxiliary_carry()) ? 'A' : '-';
4995     f[5] = (parity         ()) ? 'P' : '-';
4996     f[6] = (carry          ()) ? 'C' : '-';
4997     f[7] = '\x0';
4998     // output
4999     printf("%08x  flags = %s", _value, f);
5000   }
5001 
5002 };
5003 
5004 class IU_Register {
5005  public:
5006   int32_t _value;
5007 
5008   void print() const {
5009     printf("%08x  %11d", _value, _value);
5010   }
5011 
5012 };
5013 
5014 class IU_State {
5015  public:
5016   Flag_Register _eflags;
5017   IU_Register   _rdi;
5018   IU_Register   _rsi;
5019   IU_Register   _rbp;
5020   IU_Register   _rsp;
5021   IU_Register   _rbx;
5022   IU_Register   _rdx;
5023   IU_Register   _rcx;
5024   IU_Register   _rax;
5025 
5026   void print() const {
5027     // computation registers
5028     printf("rax  = "); _rax.print(); printf("\n");
5029     printf("rbx  = "); _rbx.print(); printf("\n");
5030     printf("rcx  = "); _rcx.print(); printf("\n");
5031     printf("rdx  = "); _rdx.print(); printf("\n");
5032     printf("rdi  = "); _rdi.print(); printf("\n");
5033     printf("rsi  = "); _rsi.print(); printf("\n");
5034     printf("rbp  = "); _rbp.print(); printf("\n");
5035     printf("rsp  = "); _rsp.print(); printf("\n");
5036     printf("\n");
5037     // control registers
5038     printf("flgs = "); _eflags.print(); printf("\n");
5039   }
5040 };
5041 
5042 
5043 class CPU_State {
5044  public:
5045   FPU_State _fpu_state;
5046   IU_State  _iu_state;
5047 
5048   void print() const {
5049     printf("--------------------------------------------------\n");
5050     _iu_state .print();
5051     printf("\n");
5052     _fpu_state.print();
5053     printf("--------------------------------------------------\n");
5054   }
5055 
5056 };
5057 
5058 
5059 static void _print_CPU_state(CPU_State* state) {
5060   state->print();
5061 };
5062 
5063 
5064 void MacroAssembler::print_CPU_state() {
5065   push_CPU_state();
5066   push(rsp);                // pass CPU state
5067   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5068   addptr(rsp, wordSize);       // discard argument
5069   pop_CPU_state();
5070 }
5071 
5072 
5073 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5074   static int counter = 0;
5075   FPU_State* fs = &state->_fpu_state;
5076   counter++;
5077   // For leaf calls, only verify that the top few elements remain empty.
5078   // We only need 1 empty at the top for C2 code.
5079   if( stack_depth < 0 ) {
5080     if( fs->tag_for_st(7) != 3 ) {
5081       printf("FPR7 not empty\n");
5082       state->print();
5083       assert(false, "error");
5084       return false;
5085     }
5086     return true;                // All other stack states do not matter
5087   }
5088 
5089   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5090          "bad FPU control word");
5091 
5092   // compute stack depth
5093   int i = 0;
5094   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5095   int d = i;
5096   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5097   // verify findings
5098   if (i != FPU_State::number_of_registers) {
5099     // stack not contiguous
5100     printf("%s: stack not contiguous at ST%d\n", s, i);
5101     state->print();
5102     assert(false, "error");
5103     return false;
5104   }
5105   // check if computed stack depth corresponds to expected stack depth
5106   if (stack_depth < 0) {
5107     // expected stack depth is -stack_depth or less
5108     if (d > -stack_depth) {
5109       // too many elements on the stack
5110       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5111       state->print();
5112       assert(false, "error");
5113       return false;
5114     }
5115   } else {
5116     // expected stack depth is stack_depth
5117     if (d != stack_depth) {
5118       // wrong stack depth
5119       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5120       state->print();
5121       assert(false, "error");
5122       return false;
5123     }
5124   }
5125   // everything is cool
5126   return true;
5127 }
5128 
5129 
5130 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5131   if (!VerifyFPU) return;
5132   push_CPU_state();
5133   push(rsp);                // pass CPU state
5134   ExternalAddress msg((address) s);
5135   // pass message string s
5136   pushptr(msg.addr());
5137   push(stack_depth);        // pass stack depth
5138   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5139   addptr(rsp, 3 * wordSize);   // discard arguments
5140   // check for error
5141   { Label L;
5142     testl(rax, rax);
5143     jcc(Assembler::notZero, L);
5144     int3();                  // break if error condition
5145     bind(L);
5146   }
5147   pop_CPU_state();
5148 }
5149 
5150 void MacroAssembler::restore_cpu_control_state_after_jni() {
5151   // Either restore the MXCSR register after returning from the JNI Call
5152   // or verify that it wasn't changed (with -Xcheck:jni flag).
5153   if (VM_Version::supports_sse()) {
5154     if (RestoreMXCSROnJNICalls) {
5155       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5156     } else if (CheckJNICalls) {
5157       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5158     }
5159   }
5160   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5161   vzeroupper();
5162   // Reset k1 to 0xffff.
5163 
5164 #ifdef COMPILER2
5165   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
5166     push(rcx);
5167     movl(rcx, 0xffff);
5168     kmovwl(k1, rcx);
5169     pop(rcx);
5170   }
5171 #endif // COMPILER2
5172 
5173 #ifndef _LP64
5174   // Either restore the x87 floating pointer control word after returning
5175   // from the JNI call or verify that it wasn't changed.
5176   if (CheckJNICalls) {
5177     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5178   }
5179 #endif // _LP64
5180 }
5181 
5182 // ((OopHandle)result).resolve();
5183 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5184   assert_different_registers(result, tmp);
5185 
5186   // Only 64 bit platforms support GCs that require a tmp register
5187   // Only IN_HEAP loads require a thread_tmp register
5188   // OopHandle::resolve is an indirection like jobject.
5189   access_load_at(T_OBJECT, IN_NATIVE,
5190                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5191 }
5192 
5193 // ((WeakHandle)result).resolve();
5194 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5195   assert_different_registers(rresult, rtmp);
5196   Label resolved;
5197 
5198   // A null weak handle resolves to null.
5199   cmpptr(rresult, 0);
5200   jcc(Assembler::equal, resolved);
5201 
5202   // Only 64 bit platforms support GCs that require a tmp register
5203   // Only IN_HEAP loads require a thread_tmp register
5204   // WeakHandle::resolve is an indirection like jweak.
5205   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5206                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5207   bind(resolved);
5208 }
5209 
5210 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5211   // get mirror
5212   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5213   load_method_holder(mirror, method);
5214   movptr(mirror, Address(mirror, mirror_offset));
5215   resolve_oop_handle(mirror, tmp);
5216 }
5217 
5218 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5219   load_method_holder(rresult, rmethod);
5220   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5221 }
5222 
5223 void MacroAssembler::load_method_holder(Register holder, Register method) {
5224   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5225   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5226   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
5227 }
5228 
5229 void MacroAssembler::load_klass(Register dst, Register src) {
5230 #ifdef _LP64
5231   if (UseCompressedClassPointers) {
5232     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5233     decode_klass_not_null(dst);
5234   } else
5235 #endif
5236     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5237 }
5238 
5239 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5240   load_klass(dst, src);
5241   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5242 }
5243 
5244 void MacroAssembler::store_klass(Register dst, Register src) {
5245 #ifdef _LP64
5246   if (UseCompressedClassPointers) {
5247     encode_klass_not_null(src);
5248     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5249   } else
5250 #endif
5251     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5252 }
5253 
5254 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5255                                     Register tmp1, Register thread_tmp) {
5256   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5257   decorators = AccessInternal::decorator_fixup(decorators);
5258   bool as_raw = (decorators & AS_RAW) != 0;
5259   if (as_raw) {
5260     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5261   } else {
5262     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5263   }
5264 }
5265 
5266 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5267                                      Register tmp1, Register tmp2) {
5268   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5269   decorators = AccessInternal::decorator_fixup(decorators);
5270   bool as_raw = (decorators & AS_RAW) != 0;
5271   if (as_raw) {
5272     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
5273   } else {
5274     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
5275   }
5276 }
5277 
5278 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5279   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5280   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5281     decorators |= ACCESS_READ | ACCESS_WRITE;
5282   }
5283   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5284   return bs->resolve(this, decorators, obj);
5285 }
5286 
5287 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5288                                    Register thread_tmp, DecoratorSet decorators) {
5289   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5290 }
5291 
5292 // Doesn't do verification; generates fixed-size code
5293 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5294                                             Register thread_tmp, DecoratorSet decorators) {
5295   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5296 }
5297 
5298 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
5299                                     Register tmp2, DecoratorSet decorators) {
5300   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
5301 }
5302 
5303 // Used for storing NULLs.
5304 void MacroAssembler::store_heap_oop_null(Address dst) {
5305   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
5306 }
5307 
5308 #ifdef _LP64
5309 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5310   if (UseCompressedClassPointers) {
5311     // Store to klass gap in destination
5312     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5313   }
5314 }
5315 
5316 #ifdef ASSERT
5317 void MacroAssembler::verify_heapbase(const char* msg) {
5318   assert (UseCompressedOops, "should be compressed");
5319   assert (Universe::heap() != NULL, "java heap should be initialized");
5320   if (CheckCompressedOops) {
5321     Label ok;
5322     push(rscratch1); // cmpptr trashes rscratch1
5323     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5324     jcc(Assembler::equal, ok);
5325     STOP(msg);
5326     bind(ok);
5327     pop(rscratch1);
5328   }
5329 }
5330 #endif
5331 
5332 // Algorithm must match oop.inline.hpp encode_heap_oop.
5333 void MacroAssembler::encode_heap_oop(Register r) {
5334 #ifdef ASSERT
5335   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5336 #endif
5337   verify_oop(r, "broken oop in encode_heap_oop");
5338   if (CompressedOops::base() == NULL) {
5339     if (CompressedOops::shift() != 0) {
5340       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5341       shrq(r, LogMinObjAlignmentInBytes);
5342     }
5343     return;
5344   }
5345   testq(r, r);
5346   cmovq(Assembler::equal, r, r12_heapbase);
5347   subq(r, r12_heapbase);
5348   shrq(r, LogMinObjAlignmentInBytes);
5349 }
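
     // Mirrors the C++ encoding in oop.inline.hpp, roughly (sketch):
     //   narrow = (r == NULL) ? 0 : (r - CompressedOops::base()) >> CompressedOops::shift();
     // the cmov above folds the NULL case into the same subtract/shift sequence.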
5350 
5351 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5352 #ifdef ASSERT
5353   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5354   if (CheckCompressedOops) {
5355     Label ok;
5356     testq(r, r);
5357     jcc(Assembler::notEqual, ok);
5358     STOP("null oop passed to encode_heap_oop_not_null");
5359     bind(ok);
5360   }
5361 #endif
5362   verify_oop(r, "broken oop in encode_heap_oop_not_null");
5363   if (CompressedOops::base() != NULL) {
5364     subq(r, r12_heapbase);
5365   }
5366   if (CompressedOops::shift() != 0) {
5367     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5368     shrq(r, LogMinObjAlignmentInBytes);
5369   }
5370 }
5371 
5372 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5373 #ifdef ASSERT
5374   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5375   if (CheckCompressedOops) {
5376     Label ok;
5377     testq(src, src);
5378     jcc(Assembler::notEqual, ok);
5379     STOP("null oop passed to encode_heap_oop_not_null2");
5380     bind(ok);
5381   }
5382 #endif
5383   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5384   if (dst != src) {
5385     movq(dst, src);
5386   }
5387   if (CompressedOops::base() != NULL) {
5388     subq(dst, r12_heapbase);
5389   }
5390   if (CompressedOops::shift() != 0) {
5391     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5392     shrq(dst, LogMinObjAlignmentInBytes);
5393   }
5394 }
5395 
5396 void  MacroAssembler::decode_heap_oop(Register r) {
5397 #ifdef ASSERT
5398   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5399 #endif
5400   if (CompressedOops::base() == NULL) {
5401     if (CompressedOops::shift() != 0) {
5402       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5403       shlq(r, LogMinObjAlignmentInBytes);
5404     }
5405   } else {
5406     Label done;
5407     shlq(r, LogMinObjAlignmentInBytes);
5408     jccb(Assembler::equal, done);
5409     addq(r, r12_heapbase);
5410     bind(done);
5411   }
5412   verify_oop(r, "broken oop in decode_heap_oop");
5413 }
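
     // Inverse of encode_heap_oop, roughly (sketch):
     //   r = (narrow == 0) ? NULL : CompressedOops::base() + ((uintptr_t)narrow << CompressedOops::shift());
     // the jccb above relies on shlq setting ZF exactly when the narrow oop was zero.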
5414 
5415 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5416   // Note: it will change flags
5417   assert (UseCompressedOops, "should only be used for compressed headers");
5418   assert (Universe::heap() != NULL, "java heap should be initialized");
5419   // Cannot assert, unverified entry point counts instructions (see .ad file)
5420   // vtableStubs also counts instructions in pd_code_size_limit.
5421   // Also do not verify_oop as this is called by verify_oop.
5422   if (CompressedOops::shift() != 0) {
5423     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5424     shlq(r, LogMinObjAlignmentInBytes);
5425     if (CompressedOops::base() != NULL) {
5426       addq(r, r12_heapbase);
5427     }
5428   } else {
5429     assert (CompressedOops::base() == NULL, "sanity");
5430   }
5431 }
5432 
5433 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5434   // Note: it will change flags
5435   assert (UseCompressedOops, "should only be used for compressed headers");
5436   assert (Universe::heap() != NULL, "java heap should be initialized");
5437   // Cannot assert, unverified entry point counts instructions (see .ad file)
5438   // vtableStubs also counts instructions in pd_code_size_limit.
5439   // Also do not verify_oop as this is called by verify_oop.
5440   if (CompressedOops::shift() != 0) {
5441     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5442     if (LogMinObjAlignmentInBytes == Address::times_8) {
5443       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5444     } else {
5445       if (dst != src) {
5446         movq(dst, src);
5447       }
5448       shlq(dst, LogMinObjAlignmentInBytes);
5449       if (CompressedOops::base() != NULL) {
5450         addq(dst, r12_heapbase);
5451       }
5452     }
5453   } else {
5454     assert (CompressedOops::base() == NULL, "sanity");
5455     if (dst != src) {
5456       movq(dst, src);
5457     }
5458   }
5459 }
5460 
5461 void MacroAssembler::encode_klass_not_null(Register r) {
5462   if (CompressedKlassPointers::base() != NULL) {
5463     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5464     assert(r != r12_heapbase, "Encoding a klass in r12");
5465     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5466     subq(r, r12_heapbase);
5467   }
5468   if (CompressedKlassPointers::shift() != 0) {
5469     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5470     shrq(r, LogKlassAlignmentInBytes);
5471   }
5472   if (CompressedKlassPointers::base() != NULL) {
5473     reinit_heapbase();
5474   }
5475 }
5476 
5477 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5478   if (dst == src) {
5479     encode_klass_not_null(src);
5480   } else {
5481     if (CompressedKlassPointers::base() != NULL) {
5482       mov64(dst, (int64_t)CompressedKlassPointers::base());
5483       negq(dst);
5484       addq(dst, src);
5485     } else {
5486       movptr(dst, src);
5487     }
5488     if (CompressedKlassPointers::shift() != 0) {
5489       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5490       shrq(dst, LogKlassAlignmentInBytes);
5491     }
5492   }
5493 }
5494 
5495 // Function instr_size_for_decode_klass_not_null() counts the instructions
5496 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
5497 // when (Universe::heap() != NULL).  Hence, if the instructions they
5498 // generate change, then this method needs to be updated.
5499 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5500   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5501   if (CompressedKlassPointers::base() != NULL) {
5502     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5503     return (CompressedKlassPointers::shift() == 0 ? 20 : 24);
5504   } else {
5505     // longest load decode klass function, mov64, leaq
5506     return 16;
5507   }
5508 }
5509 
5510 // !!! If the instructions that get generated here change then function
5511 // instr_size_for_decode_klass_not_null() needs to get updated.
5512 void  MacroAssembler::decode_klass_not_null(Register r) {
5513   // Note: it will change flags
5514   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5515   assert(r != r12_heapbase, "Decoding a klass in r12");
5516   // Cannot assert, unverified entry point counts instructions (see .ad file)
5517   // vtableStubs also counts instructions in pd_code_size_limit.
5518   // Also do not verify_oop as this is called by verify_oop.
5519   if (CompressedKlassPointers::shift() != 0) {
5520     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5521     shlq(r, LogKlassAlignmentInBytes);
5522   }
5523   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5524   if (CompressedKlassPointers::base() != NULL) {
5525     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5526     addq(r, r12_heapbase);
5527     reinit_heapbase();
5528   }
5529 }
5530 
5531 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5532   // Note: it will change flags
5533   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5534   if (dst == src) {
5535     decode_klass_not_null(dst);
5536   } else {
5537     // Cannot assert, unverified entry point counts instructions (see .ad file)
5538     // vtableStubs also counts instructions in pd_code_size_limit.
5539     // Also do not verify_oop as this is called by verify_oop.
5540     mov64(dst, (int64_t)CompressedKlassPointers::base());
5541     if (CompressedKlassPointers::shift() != 0) {
5542       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5543       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5544       leaq(dst, Address(dst, src, Address::times_8, 0));
5545     } else {
5546       addq(dst, src);
5547     }
5548   }
5549 }
5550 
5551 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5552   assert (UseCompressedOops, "should only be used for compressed headers");
5553   assert (Universe::heap() != NULL, "java heap should be initialized");
5554   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5555   int oop_index = oop_recorder()->find_index(obj);
5556   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5557   mov_narrow_oop(dst, oop_index, rspec);
5558 }
5559 
5560 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5561   assert (UseCompressedOops, "should only be used for compressed headers");
5562   assert (Universe::heap() != NULL, "java heap should be initialized");
5563   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5564   int oop_index = oop_recorder()->find_index(obj);
5565   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5566   mov_narrow_oop(dst, oop_index, rspec);
5567 }
5568 
5569 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5570   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5571   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5572   int klass_index = oop_recorder()->find_index(k);
5573   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5574   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5575 }
5576 
5577 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5578   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5579   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5580   int klass_index = oop_recorder()->find_index(k);
5581   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5582   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5583 }
5584 
5585 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5586   assert (UseCompressedOops, "should only be used for compressed headers");
5587   assert (Universe::heap() != NULL, "java heap should be initialized");
5588   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5589   int oop_index = oop_recorder()->find_index(obj);
5590   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5591   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5592 }
5593 
5594 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5595   assert (UseCompressedOops, "should only be used for compressed headers");
5596   assert (Universe::heap() != NULL, "java heap should be initialized");
5597   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5598   int oop_index = oop_recorder()->find_index(obj);
5599   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5600   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5601 }
5602 
5603 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5604   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5605   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5606   int klass_index = oop_recorder()->find_index(k);
5607   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5608   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5609 }
5610 
5611 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5612   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5613   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5614   int klass_index = oop_recorder()->find_index(k);
5615   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5616   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5617 }
5618 
5619 void MacroAssembler::reinit_heapbase() {
5620   if (UseCompressedOops || UseCompressedClassPointers) {
5621     if (Universe::heap() != NULL) {
5622       if (CompressedOops::base() == NULL) {
5623         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5624       } else {
5625         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5626       }
5627     } else {
5628       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5629     }
5630   }
5631 }
5632 
5633 #endif // _LP64
5634 
5635 // C2 compiled method's prolog code.
5636 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
5637 
5638   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5639   // NativeJump::patch_verified_entry will be able to patch out the entry
5640   // code safely. The push to verify stack depth is ok at 5 bytes,
5641   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5642   // stack bang then we must use the 6 byte frame allocation even if
5643   // we have no frame. :-(
5644   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5645 
5646   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5647   // Remove word for return addr
5648   framesize -= wordSize;
5649   stack_bang_size -= wordSize;
5650 
5651   // Calls to C2R adapters often do not accept exceptional returns.
5652   // We require their callers to bang for them.  Be careful, though: some VM
5653   // calls (such as call site linkage) can use several kilobytes of stack,
5654   // but the stack safety zone should account for that.
5655   // See bugs 4446381, 4468289, 4497237.
5656   if (stack_bang_size > 0) {
5657     generate_stack_overflow_check(stack_bang_size);
5658 
5659     // We always push rbp so that on return to the interpreter rbp will be
5660     // restored correctly and we can correct the stack.
5661     push(rbp);
5662     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5663     if (PreserveFramePointer) {
5664       mov(rbp, rsp);
5665     }
5666     // Remove word for ebp
5667     framesize -= wordSize;
5668 
5669     // Create frame
5670     if (framesize) {
5671       subptr(rsp, framesize);
5672     }
5673   } else {
5674     // Create frame (force generation of a 4 byte immediate value)
5675     subptr_imm32(rsp, framesize);
5676 
5677     // Save RBP register now.
5678     framesize -= wordSize;
5679     movptr(Address(rsp, framesize), rbp);
5680     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5681     if (PreserveFramePointer) {
5682       movptr(rbp, rsp);
5683       if (framesize > 0) {
5684         addptr(rbp, framesize);
5685       }
5686     }
5687   }
5688 
5689   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5690     framesize -= wordSize;
5691     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5692   }
5693 
5694 #ifndef _LP64
5695   // If method sets FPU control word do it now
5696   if (fp_mode_24b) {
5697     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
5698   }
5699   if (UseSSE >= 2 && VerifyFPU) {
5700     verify_FPU(0, "FPU stack must be clean on entry");
5701   }
5702 #endif
5703 
5704 #ifdef ASSERT
5705   if (VerifyStackAtCalls) {
5706     Label L;
5707     push(rax);
5708     mov(rax, rsp);
5709     andptr(rax, StackAlignmentInBytes-1);
5710     cmpptr(rax, StackAlignmentInBytes-wordSize);
5711     pop(rax);
5712     jcc(Assembler::equal, L);
5713     STOP("Stack is not properly aligned!");
5714     bind(L);
5715   }
5716 #endif
5717 
5718   if (!is_stub) {
5719     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5720     bs->nmethod_entry_barrier(this);
5721   }
5722 }
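
     // Resulting C2 frame, growing downward (sketch; stack-bang path shown):
     //   [ return address                                     ]  pushed by the caller
     //   [ saved rbp                                          ]
     //   [ framesize - 2*wordSize bytes of spill slots/locals ]  rsp points at the bottom
     // The no-bang path builds the same layout but allocates with subptr_imm32 and stores rbp
     // with a mov, keeping the first instruction >= 5 bytes for NativeJump::patch_verified_entry.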
5723 
5724 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
5725 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
5726   // cnt - number of qwords (8-byte words).
5727   // base - start address, qword aligned.
5728   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5729   if (UseAVX >= 2) {
5730     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5731   } else {
5732     pxor(xtmp, xtmp);
5733   }
5734   jmp(L_zero_64_bytes);
5735 
5736   BIND(L_loop);
5737   if (UseAVX >= 2) {
5738     vmovdqu(Address(base,  0), xtmp);
5739     vmovdqu(Address(base, 32), xtmp);
5740   } else {
5741     movdqu(Address(base,  0), xtmp);
5742     movdqu(Address(base, 16), xtmp);
5743     movdqu(Address(base, 32), xtmp);
5744     movdqu(Address(base, 48), xtmp);
5745   }
5746   addptr(base, 64);
5747 
5748   BIND(L_zero_64_bytes);
5749   subptr(cnt, 8);
5750   jccb(Assembler::greaterEqual, L_loop);
5751   addptr(cnt, 4);
5752   jccb(Assembler::less, L_tail);
5753   // Copy trailing 32 bytes
5754   if (UseAVX >= 2) {
5755     vmovdqu(Address(base, 0), xtmp);
5756   } else {
5757     movdqu(Address(base,  0), xtmp);
5758     movdqu(Address(base, 16), xtmp);
5759   }
5760   addptr(base, 32);
5761   subptr(cnt, 4);
5762 
5763   BIND(L_tail);
5764   addptr(cnt, 4);
5765   jccb(Assembler::lessEqual, L_end);
5766   decrement(cnt);
5767 
5768   BIND(L_sloop);
5769   movq(Address(base, 0), xtmp);
5770   addptr(base, 8);
5771   decrement(cnt);
5772   jccb(Assembler::greaterEqual, L_sloop);
5773   BIND(L_end);
5774 }
5775 
5776 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
5777   // cnt - number of qwords (8-byte words).
5778   // base - start address, qword aligned.
5779   // is_large - if optimizers know cnt is larger than InitArrayShortSize
5780   assert(base==rdi, "base register must be edi for rep stos");
5781   assert(tmp==rax,   "tmp register must be eax for rep stos");
5782   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5783   assert(InitArrayShortSize % BytesPerLong == 0,
5784     "InitArrayShortSize should be the multiple of BytesPerLong");
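       // Hedged outline of the dispatch implemented below (a summary, not extra code):
       //   - !is_large and cnt <= InitArrayShortSize/BytesPerLong : store qwords one by one
       //   - UseFastStosb                                         : rep stosb over cnt*8 bytes
       //   - UseXMMForObjInit                                     : xmm_clear_mem(base, cnt, xtmp)
       //   - otherwise                                            : rep stos over cnt pointer-sized words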
5785 
5786   Label DONE;
5787 
5788   if (!is_large || !UseXMMForObjInit) {
5789     xorptr(tmp, tmp);
5790   }
5791 
5792   if (!is_large) {
5793     Label LOOP, LONG;
5794     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5795     jccb(Assembler::greater, LONG);
5796 
5797     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5798 
5799     decrement(cnt);
5800     jccb(Assembler::negative, DONE); // Zero length
5801 
5802     // Use individual pointer-sized stores for small counts:
5803     BIND(LOOP);
5804     movptr(Address(base, cnt, Address::times_ptr), tmp);
5805     decrement(cnt);
5806     jccb(Assembler::greaterEqual, LOOP);
5807     jmpb(DONE);
5808 
5809     BIND(LONG);
5810   }
5811 
5812   // Use longer rep-prefixed ops for non-small counts:
5813   if (UseFastStosb) {
5814     shlptr(cnt, 3); // convert to number of bytes
5815     rep_stosb();
5816   } else if (UseXMMForObjInit) {
5817     movptr(tmp, base);
5818     xmm_clear_mem(tmp, cnt, xtmp);
5819   } else {
5820     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5821     rep_stos();
5822   }
5823 
5824   BIND(DONE);
5825 }
5826 
5827 #ifdef COMPILER2
5828 
5829 // IndexOf for constant substrings with size >= 8 chars
5830 // which don't need to be loaded through the stack.
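     // Java-level semantics being implemented, as a hedged reference sketch over
     // char[] data (the LL/UU/UL encoding variants and the SSE4.2 details are omitted):
     //   static int indexOf(char[] source, int sourceCount, char[] target, int targetCount) {
     //     for (int i = 0; i <= sourceCount - targetCount; i++) {
     //       int j = 0;
     //       while (j < targetCount && source[i + j] == target[j]) j++;
     //       if (j == targetCount) return i;   // match starts at index i
     //     }
     //     return -1;                          // no match
     //   }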
5831 void MacroAssembler::string_indexofC8(Register str1, Register str2,
5832                                       Register cnt1, Register cnt2,
5833                                       int int_cnt2,  Register result,
5834                                       XMMRegister vec, Register tmp,
5835                                       int ae) {
5836   ShortBranchVerifier sbv(this);
5837   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
5838   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
5839 
5840   // This method uses the pcmpestri instruction with bound registers
5841   //   inputs:
5842   //     xmm - substring
5843   //     rax - substring length (elements count)
5844   //     mem - scanned string
5845   //     rdx - string length (elements count)
5846   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
5847   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
5848   //   outputs:
5849   //     rcx - matched index in string
5850   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
5851   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
5852   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
5853   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
5854   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
5855 
5856   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
5857         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
5858         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
5859 
5860   // Note, inline_string_indexOf() generates checks:
5861   // if (substr.count > string.count) return -1;
5862   // if (substr.count == 0) return 0;
5863   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
5864 
5865   // Load substring.
5866   if (ae == StrIntrinsicNode::UL) {
5867     pmovzxbw(vec, Address(str2, 0));
5868   } else {
5869     movdqu(vec, Address(str2, 0));
5870   }
5871   movl(cnt2, int_cnt2);
5872   movptr(result, str1); // string addr
5873 
5874   if (int_cnt2 > stride) {
5875     jmpb(SCAN_TO_SUBSTR);
5876 
5877     // Reload substr for rescan; this code
5878     // is executed only for large substrings (> 8 chars).
5879     bind(RELOAD_SUBSTR);
5880     if (ae == StrIntrinsicNode::UL) {
5881       pmovzxbw(vec, Address(str2, 0));
5882     } else {
5883       movdqu(vec, Address(str2, 0));
5884     }
5885     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
5886 
5887     bind(RELOAD_STR);
5888     // We came here after the beginning of the substring was
5889     // matched but the rest of it was not so we need to search
5890     // again. Start from the next element after the previous match.
5891 
5892     // cnt2 is the number of remaining substring elements and
5893     // cnt1 is the number of remaining string elements when the compare failed.
5894     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
5895     subl(cnt1, cnt2);
5896     addl(cnt1, int_cnt2);
5897     movl(cnt2, int_cnt2); // Now restore cnt2
5898 
5899     decrementl(cnt1);     // Shift to next element
5900     cmpl(cnt1, cnt2);
5901     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
5902 
5903     addptr(result, (1<<scale1));
5904 
5905   } // (int_cnt2 > 8)
5906 
5907   // Scan string for start of substr in 16-byte vectors
5908   bind(SCAN_TO_SUBSTR);
5909   pcmpestri(vec, Address(result, 0), mode);
5910   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
5911   subl(cnt1, stride);
5912   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
5913   cmpl(cnt1, cnt2);
5914   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
5915   addptr(result, 16);
5916   jmpb(SCAN_TO_SUBSTR);
5917 
5918   // Found a potential substr
5919   bind(FOUND_CANDIDATE);
5920   // Matched whole vector if first element matched (tmp(rcx) == 0).
5921   if (int_cnt2 == stride) {
5922     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
5923   } else { // int_cnt2 > 8
5924     jccb(Assembler::overflow, FOUND_SUBSTR);
5925   }
5926   // After pcmpestri tmp(rcx) contains matched element index
5927   // Compute start addr of substr
5928   lea(result, Address(result, tmp, scale1));
5929 
5930   // Make sure string is still long enough
5931   subl(cnt1, tmp);
5932   cmpl(cnt1, cnt2);
5933   if (int_cnt2 == stride) {
5934     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
5935   } else { // int_cnt2 > 8
5936     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
5937   }
5938   // Left less than substring.
5939 
5940   bind(RET_NOT_FOUND);
5941   movl(result, -1);
5942   jmp(EXIT);
5943 
5944   if (int_cnt2 > stride) {
5945     // This code is optimized for the case when whole substring
5946     // is matched if its head is matched.
5947     bind(MATCH_SUBSTR_HEAD);
5948     pcmpestri(vec, Address(result, 0), mode);
5949     // Reload only the string if it does not match
5950     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
5951 
5952     Label CONT_SCAN_SUBSTR;
5953     // Compare the rest of substring (> 8 chars).
5954     bind(FOUND_SUBSTR);
5955     // First 8 chars are already matched.
5956     negptr(cnt2);
5957     addptr(cnt2, stride);
5958 
5959     bind(SCAN_SUBSTR);
5960     subl(cnt1, stride);
5961     cmpl(cnt2, -stride); // Do not read beyond substring
5962     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
5963     // Back-up strings to avoid reading beyond substring:
5964     // cnt1 = cnt1 - cnt2 + 8
5965     addl(cnt1, cnt2); // cnt2 is negative
5966     addl(cnt1, stride);
5967     movl(cnt2, stride); negptr(cnt2);
5968     bind(CONT_SCAN_SUBSTR);
5969     if (int_cnt2 < (int)G) {
5970       int tail_off1 = int_cnt2<<scale1;
5971       int tail_off2 = int_cnt2<<scale2;
5972       if (ae == StrIntrinsicNode::UL) {
5973         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
5974       } else {
5975         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
5976       }
5977       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
5978     } else {
5979       // calculate index in register to avoid integer overflow (int_cnt2*2)
5980       movl(tmp, int_cnt2);
5981       addptr(tmp, cnt2);
5982       if (ae == StrIntrinsicNode::UL) {
5983         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
5984       } else {
5985         movdqu(vec, Address(str2, tmp, scale2, 0));
5986       }
5987       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
5988     }
5989     // Need to reload string pointers if the whole vector did not match
5990     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
5991     addptr(cnt2, stride);
5992     jcc(Assembler::negative, SCAN_SUBSTR);
5993     // Fall through if found full substring
5994 
5995   } // (int_cnt2 > 8)
5996 
5997   bind(RET_FOUND);
5998   // Found result if we matched full small substring.
5999   // Compute substr offset
6000   subptr(result, str1);
6001   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6002     shrl(result, 1); // index
6003   }
6004   bind(EXIT);
6005 
6006 } // string_indexofC8
6007 
6008 // Small strings are loaded through the stack if they cross a page boundary.
6009 void MacroAssembler::string_indexof(Register str1, Register str2,
6010                                     Register cnt1, Register cnt2,
6011                                     int int_cnt2,  Register result,
6012                                     XMMRegister vec, Register tmp,
6013                                     int ae) {
6014   ShortBranchVerifier sbv(this);
6015   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6016   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6017 
6018   //
6019   // int_cnt2 is the length of a small (< 8 chars) constant substring,
6020   // or (-1) for a non-constant substring, in which case its length
6021   // is in the cnt2 register.
6022   //
6023   // Note, inline_string_indexOf() generates checks:
6024   // if (substr.count > string.count) return -1;
6025   // if (substr.count == 0) return 0;
6026   //
6027   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6028   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
6029   // This method uses the pcmpestri instruction with bound registers
6030   //   inputs:
6031   //     xmm - substring
6032   //     rax - substring length (elements count)
6033   //     mem - scanned string
6034   //     rdx - string length (elements count)
6035   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6036   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6037   //   outputs:
6038   //     rcx - matched index in string
6039   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6040   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6041   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6042   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6043 
6044   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6045         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6046         FOUND_CANDIDATE;
6047 
6048   { //========================================================
6049     // We don't know where these strings are located
6050     // and we can't read beyond them. Load them through the stack.
6051     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6052 
6053     movptr(tmp, rsp); // save old SP
6054 
6055     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6056       if (int_cnt2 == (1>>scale2)) { // One byte
6057         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
6058         load_unsigned_byte(result, Address(str2, 0));
6059         movdl(vec, result); // move 32 bits
6060       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
6061         // Not enough header space in 32-bit VM: 12+3 = 15.
6062         movl(result, Address(str2, -1));
6063         shrl(result, 8);
6064         movdl(vec, result); // move 32 bits
6065       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
6066         load_unsigned_short(result, Address(str2, 0));
6067         movdl(vec, result); // move 32 bits
6068       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
6069         movdl(vec, Address(str2, 0)); // move 32 bits
6070       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
6071         movq(vec, Address(str2, 0));  // move 64 bits
6072       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
6073         // Array header size is 12 bytes in 32-bit VM
6074         // + 6 bytes for 3 chars == 18 bytes,
6075         // enough space to load vec and shift.
6076         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6077         if (ae == StrIntrinsicNode::UL) {
6078           int tail_off = int_cnt2-8;
6079           pmovzxbw(vec, Address(str2, tail_off));
6080           psrldq(vec, -2*tail_off);
6081         }
6082         else {
6083           int tail_off = int_cnt2*(1<<scale2);
6084           movdqu(vec, Address(str2, tail_off-16));
6085           psrldq(vec, 16-tail_off);
6086         }
6087       }
6088     } else { // not constant substring
6089       cmpl(cnt2, stride);
6090       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6091 
6092       // We can read beyond the string if str+16 does not cross a page boundary
6093       // since heaps are aligned and mapped by pages.
6094       assert(os::vm_page_size() < (int)G, "default page should be small");
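           // Hedged sketch of the check below: a 16-byte load at addr stays within
           // one page iff (addr & (page_size - 1)) <= page_size - 16, i.e.
           //   boolean safeToRead16(long addr, int pageSize) {
           //     return (addr & (pageSize - 1)) <= pageSize - 16;
           //   }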
6095       movl(result, str2); // We need only low 32 bits
6096       andl(result, (os::vm_page_size()-1));
6097       cmpl(result, (os::vm_page_size()-16));
6098       jccb(Assembler::belowEqual, CHECK_STR);
6099 
6100       // Move small strings to the stack to allow loading 16 bytes into vec.
6101       subptr(rsp, 16);
6102       int stk_offset = wordSize-(1<<scale2);
6103       push(cnt2);
6104 
6105       bind(COPY_SUBSTR);
6106       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
6107         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
6108         movb(Address(rsp, cnt2, scale2, stk_offset), result);
6109       } else if (ae == StrIntrinsicNode::UU) {
6110         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
6111         movw(Address(rsp, cnt2, scale2, stk_offset), result);
6112       }
6113       decrement(cnt2);
6114       jccb(Assembler::notZero, COPY_SUBSTR);
6115 
6116       pop(cnt2);
6117       movptr(str2, rsp);  // New substring address
6118     } // non constant
6119 
6120     bind(CHECK_STR);
6121     cmpl(cnt1, stride);
6122     jccb(Assembler::aboveEqual, BIG_STRINGS);
6123 
6124     // Check cross page boundary.
6125     movl(result, str1); // We need only low 32 bits
6126     andl(result, (os::vm_page_size()-1));
6127     cmpl(result, (os::vm_page_size()-16));
6128     jccb(Assembler::belowEqual, BIG_STRINGS);
6129 
6130     subptr(rsp, 16);
6131     int stk_offset = -(1<<scale1);
6132     if (int_cnt2 < 0) { // not constant
6133       push(cnt2);
6134       stk_offset += wordSize;
6135     }
6136     movl(cnt2, cnt1);
6137 
6138     bind(COPY_STR);
6139     if (ae == StrIntrinsicNode::LL) {
6140       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
6141       movb(Address(rsp, cnt2, scale1, stk_offset), result);
6142     } else {
6143       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
6144       movw(Address(rsp, cnt2, scale1, stk_offset), result);
6145     }
6146     decrement(cnt2);
6147     jccb(Assembler::notZero, COPY_STR);
6148 
6149     if (int_cnt2 < 0) { // not constant
6150       pop(cnt2);
6151     }
6152     movptr(str1, rsp);  // New string address
6153 
6154     bind(BIG_STRINGS);
6155     // Load substring.
6156     if (int_cnt2 < 0) { // -1
6157       if (ae == StrIntrinsicNode::UL) {
6158         pmovzxbw(vec, Address(str2, 0));
6159       } else {
6160         movdqu(vec, Address(str2, 0));
6161       }
6162       push(cnt2);       // substr count
6163       push(str2);       // substr addr
6164       push(str1);       // string addr
6165     } else {
6166       // Small (< 8 chars) constant substrings are loaded already.
6167       movl(cnt2, int_cnt2);
6168     }
6169     push(tmp);  // original SP
6170 
6171   } // Finished loading
6172 
6173   //========================================================
6174   // Start search
6175   //
6176 
6177   movptr(result, str1); // string addr
6178 
6179   if (int_cnt2  < 0) {  // Only for non constant substring
6180     jmpb(SCAN_TO_SUBSTR);
6181 
6182     // SP saved at sp+0
6183     // String saved at sp+1*wordSize
6184     // Substr saved at sp+2*wordSize
6185     // Substr count saved at sp+3*wordSize
6186 
6187     // Reload substr for rescan; this code
6188     // is executed only for large substrings (> 8 chars).
6189     bind(RELOAD_SUBSTR);
6190     movptr(str2, Address(rsp, 2*wordSize));
6191     movl(cnt2, Address(rsp, 3*wordSize));
6192     if (ae == StrIntrinsicNode::UL) {
6193       pmovzxbw(vec, Address(str2, 0));
6194     } else {
6195       movdqu(vec, Address(str2, 0));
6196     }
6197     // We came here after the beginning of the substring was
6198     // matched but the rest of it was not so we need to search
6199     // again. Start from the next element after the previous match.
6200     subptr(str1, result); // Restore counter
6201     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6202       shrl(str1, 1);
6203     }
6204     addl(cnt1, str1);
6205     decrementl(cnt1);   // Shift to next element
6206     cmpl(cnt1, cnt2);
6207     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6208 
6209     addptr(result, (1<<scale1));
6210   } // non constant
6211 
6212   // Scan string for start of substr in 16-byte vectors
6213   bind(SCAN_TO_SUBSTR);
6214   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6215   pcmpestri(vec, Address(result, 0), mode);
6216   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6217   subl(cnt1, stride);
6218   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6219   cmpl(cnt1, cnt2);
6220   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6221   addptr(result, 16);
6222 
6223   bind(ADJUST_STR);
6224   cmpl(cnt1, stride); // Do not read beyond string
6225   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6226   // Back-up string to avoid reading beyond string.
6227   lea(result, Address(result, cnt1, scale1, -16));
6228   movl(cnt1, stride);
6229   jmpb(SCAN_TO_SUBSTR);
6230 
6231   // Found a potential substr
6232   bind(FOUND_CANDIDATE);
6233   // After pcmpestri tmp(rcx) contains matched element index
6234 
6235   // Make sure string is still long enough
6236   subl(cnt1, tmp);
6237   cmpl(cnt1, cnt2);
6238   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6239   // Left less than substring.
6240 
6241   bind(RET_NOT_FOUND);
6242   movl(result, -1);
6243   jmp(CLEANUP);
6244 
6245   bind(FOUND_SUBSTR);
6246   // Compute start addr of substr
6247   lea(result, Address(result, tmp, scale1));
6248   if (int_cnt2 > 0) { // Constant substring
6249     // Repeat search for small substring (< 8 chars)
6250     // from new point without reloading substring.
6251     // Have to check that we don't read beyond string.
6252     cmpl(tmp, stride-int_cnt2);
6253     jccb(Assembler::greater, ADJUST_STR);
6254     // Fall through if matched whole substring.
6255   } else { // non constant
6256     assert(int_cnt2 == -1, "should be != 0");
6257 
6258     addl(tmp, cnt2);
6259     // Found result if we matched whole substring.
6260     cmpl(tmp, stride);
6261     jcc(Assembler::lessEqual, RET_FOUND);
6262 
6263     // Repeat search for small substring (<= 8 chars)
6264     // from new point 'str1' without reloading substring.
6265     cmpl(cnt2, stride);
6266     // Have to check that we don't read beyond string.
6267     jccb(Assembler::lessEqual, ADJUST_STR);
6268 
6269     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6270     // Compare the rest of substring (> 8 chars).
6271     movptr(str1, result);
6272 
6273     cmpl(tmp, cnt2);
6274     // First 8 chars are already matched.
6275     jccb(Assembler::equal, CHECK_NEXT);
6276 
6277     bind(SCAN_SUBSTR);
6278     pcmpestri(vec, Address(str1, 0), mode);
6279     // Need to reload string pointers if the whole vector did not match
6280     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6281 
6282     bind(CHECK_NEXT);
6283     subl(cnt2, stride);
6284     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6285     addptr(str1, 16);
6286     if (ae == StrIntrinsicNode::UL) {
6287       addptr(str2, 8);
6288     } else {
6289       addptr(str2, 16);
6290     }
6291     subl(cnt1, stride);
6292     cmpl(cnt2, stride); // Do not read beyond substring
6293     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6294     // Back-up strings to avoid reading beyond substring.
6295 
6296     if (ae == StrIntrinsicNode::UL) {
6297       lea(str2, Address(str2, cnt2, scale2, -8));
6298       lea(str1, Address(str1, cnt2, scale1, -16));
6299     } else {
6300       lea(str2, Address(str2, cnt2, scale2, -16));
6301       lea(str1, Address(str1, cnt2, scale1, -16));
6302     }
6303     subl(cnt1, cnt2);
6304     movl(cnt2, stride);
6305     addl(cnt1, stride);
6306     bind(CONT_SCAN_SUBSTR);
6307     if (ae == StrIntrinsicNode::UL) {
6308       pmovzxbw(vec, Address(str2, 0));
6309     } else {
6310       movdqu(vec, Address(str2, 0));
6311     }
6312     jmp(SCAN_SUBSTR);
6313 
6314     bind(RET_FOUND_LONG);
6315     movptr(str1, Address(rsp, wordSize));
6316   } // non constant
6317 
6318   bind(RET_FOUND);
6319   // Compute substr offset
6320   subptr(result, str1);
6321   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6322     shrl(result, 1); // index
6323   }
6324   bind(CLEANUP);
6325   pop(rsp); // restore SP
6326 
6327 } // string_indexof
6328 
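     // Java-level semantics, as a hedged reference sketch (names are illustrative):
     //   static int indexOfChar(char[] value, char ch, int cnt) {
     //     for (int i = 0; i < cnt; i++) {
     //       if (value[i] == ch) return i;   // index of first occurrence
     //     }
     //     return -1;                        // not found
     //   }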
6329 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6330                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6331   ShortBranchVerifier sbv(this);
6332   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6333 
6334   int stride = 8;
6335 
6336   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6337         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6338         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6339         FOUND_SEQ_CHAR, DONE_LABEL;
6340 
6341   movptr(result, str1);
6342   if (UseAVX >= 2) {
6343     cmpl(cnt1, stride);
6344     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6345     cmpl(cnt1, 2*stride);
6346     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6347     movdl(vec1, ch);
6348     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6349     vpxor(vec2, vec2);
6350     movl(tmp, cnt1);
6351     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
6352     andl(cnt1,0x0000000F);  //tail count (in chars)
6353 
6354     bind(SCAN_TO_16_CHAR_LOOP);
6355     vmovdqu(vec3, Address(result, 0));
6356     vpcmpeqw(vec3, vec3, vec1, 1);
6357     vptest(vec2, vec3);
6358     jcc(Assembler::carryClear, FOUND_CHAR);
6359     addptr(result, 32);
6360     subl(tmp, 2*stride);
6361     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6362     jmp(SCAN_TO_8_CHAR);
6363     bind(SCAN_TO_8_CHAR_INIT);
6364     movdl(vec1, ch);
6365     pshuflw(vec1, vec1, 0x00);
6366     pshufd(vec1, vec1, 0);
6367     pxor(vec2, vec2);
6368   }
6369   bind(SCAN_TO_8_CHAR);
6370   cmpl(cnt1, stride);
6371   if (UseAVX >= 2) {
6372     jcc(Assembler::less, SCAN_TO_CHAR);
6373   } else {
6374     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6375     movdl(vec1, ch);
6376     pshuflw(vec1, vec1, 0x00);
6377     pshufd(vec1, vec1, 0);
6378     pxor(vec2, vec2);
6379   }
6380   movl(tmp, cnt1);
6381   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
6382   andl(cnt1,0x00000007);  //tail count (in chars)
6383 
6384   bind(SCAN_TO_8_CHAR_LOOP);
6385   movdqu(vec3, Address(result, 0));
6386   pcmpeqw(vec3, vec1);
6387   ptest(vec2, vec3);
6388   jcc(Assembler::carryClear, FOUND_CHAR);
6389   addptr(result, 16);
6390   subl(tmp, stride);
6391   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
6392   bind(SCAN_TO_CHAR);
6393   testl(cnt1, cnt1);
6394   jcc(Assembler::zero, RET_NOT_FOUND);
6395   bind(SCAN_TO_CHAR_LOOP);
6396   load_unsigned_short(tmp, Address(result, 0));
6397   cmpl(ch, tmp);
6398   jccb(Assembler::equal, FOUND_SEQ_CHAR);
6399   addptr(result, 2);
6400   subl(cnt1, 1);
6401   jccb(Assembler::zero, RET_NOT_FOUND);
6402   jmp(SCAN_TO_CHAR_LOOP);
6403 
6404   bind(RET_NOT_FOUND);
6405   movl(result, -1);
6406   jmpb(DONE_LABEL);
6407 
6408   bind(FOUND_CHAR);
6409   if (UseAVX >= 2) {
6410     vpmovmskb(tmp, vec3);
6411   } else {
6412     pmovmskb(tmp, vec3);
6413   }
6414   bsfl(ch, tmp);
6415   addl(result, ch);
6416 
6417   bind(FOUND_SEQ_CHAR);
6418   subptr(result, str1);
6419   shrl(result, 1);
6420 
6421   bind(DONE_LABEL);
6422 } // string_indexof_char
6423 
6424 // helper function for string_compare
6425 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
6426                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
6427                                         Address::ScaleFactor scale2, Register index, int ae) {
6428   if (ae == StrIntrinsicNode::LL) {
6429     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
6430     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
6431   } else if (ae == StrIntrinsicNode::UU) {
6432     load_unsigned_short(elem1, Address(str1, index, scale, 0));
6433     load_unsigned_short(elem2, Address(str2, index, scale, 0));
6434   } else {
6435     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
6436     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
6437   }
6438 }
6439 
6440 // Compare strings, used for char[] and byte[].
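     // Java-level semantics, as a hedged reference sketch over char[] data
     // (the LL/UU/LU/UL encoding handling below is omitted here):
     //   static int compare(char[] str1, int len1, char[] str2, int len2) {
     //     int lim = Math.min(len1, len2);
     //     for (int k = 0; k < lim; k++) {
     //       if (str1[k] != str2[k]) return str1[k] - str2[k];  // first mismatch
     //     }
     //     return len1 - len2;   // equal up to min length: compare lengths
     //   }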
6441 void MacroAssembler::string_compare(Register str1, Register str2,
6442                                     Register cnt1, Register cnt2, Register result,
6443                                     XMMRegister vec1, int ae) {
6444   ShortBranchVerifier sbv(this);
6445   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6446   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
6447   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
6448   int stride2x2 = 0x40;
6449   Address::ScaleFactor scale = Address::no_scale;
6450   Address::ScaleFactor scale1 = Address::no_scale;
6451   Address::ScaleFactor scale2 = Address::no_scale;
6452 
6453   if (ae != StrIntrinsicNode::LL) {
6454     stride2x2 = 0x20;
6455   }
6456 
6457   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
6458     shrl(cnt2, 1);
6459   }
6460   // Compute the minimum of the string lengths and push the
6461   // difference of the string lengths onto the stack,
6462   // then use a conditional move to select the minimum.
6463   movl(result, cnt1);
6464   subl(cnt1, cnt2);
6465   push(cnt1);
6466   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
6467 
6468   // Is the minimum length zero?
6469   testl(cnt2, cnt2);
6470   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6471   if (ae == StrIntrinsicNode::LL) {
6472     // Load first bytes
6473     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
6474     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
6475   } else if (ae == StrIntrinsicNode::UU) {
6476     // Load first characters
6477     load_unsigned_short(result, Address(str1, 0));
6478     load_unsigned_short(cnt1, Address(str2, 0));
6479   } else {
6480     load_unsigned_byte(result, Address(str1, 0));
6481     load_unsigned_short(cnt1, Address(str2, 0));
6482   }
6483   subl(result, cnt1);
6484   jcc(Assembler::notZero,  POP_LABEL);
6485 
6486   if (ae == StrIntrinsicNode::UU) {
6487     // Divide length by 2 to get number of chars
6488     shrl(cnt2, 1);
6489   }
6490   cmpl(cnt2, 1);
6491   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6492 
6493   // Check if the strings start at the same location and setup scale and stride
6494   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6495     cmpptr(str1, str2);
6496     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6497     if (ae == StrIntrinsicNode::LL) {
6498       scale = Address::times_1;
6499       stride = 16;
6500     } else {
6501       scale = Address::times_2;
6502       stride = 8;
6503     }
6504   } else {
6505     scale1 = Address::times_1;
6506     scale2 = Address::times_2;
6507     // scale not used
6508     stride = 8;
6509   }
6510 
6511   if (UseAVX >= 2 && UseSSE42Intrinsics) {
6512     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6513     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6514     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
6515     Label COMPARE_TAIL_LONG;
6516     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
6517 
6518     int pcmpmask = 0x19;
6519     if (ae == StrIntrinsicNode::LL) {
6520       pcmpmask &= ~0x01;
6521     }
6522 
6523     // Set up to compare 16-char (32-byte) vectors,
6524     // starting from the first character again because it has an aligned address.
6525     if (ae == StrIntrinsicNode::LL) {
6526       stride2 = 32;
6527     } else {
6528       stride2 = 16;
6529     }
6530     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6531       adr_stride = stride << scale;
6532     } else {
6533       adr_stride1 = 8;  //stride << scale1;
6534       adr_stride2 = 16; //stride << scale2;
6535     }
6536 
6537     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6538     // rax and rdx are used by pcmpestri as elements counters
6539     movl(result, cnt2);
6540     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
6541     jcc(Assembler::zero, COMPARE_TAIL_LONG);
6542 
6543     // fast path : compare first 2 8-char vectors.
6544     bind(COMPARE_16_CHARS);
6545     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6546       movdqu(vec1, Address(str1, 0));
6547     } else {
6548       pmovzxbw(vec1, Address(str1, 0));
6549     }
6550     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6551     jccb(Assembler::below, COMPARE_INDEX_CHAR);
6552 
6553     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6554       movdqu(vec1, Address(str1, adr_stride));
6555       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6556     } else {
6557       pmovzxbw(vec1, Address(str1, adr_stride1));
6558       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
6559     }
6560     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6561     addl(cnt1, stride);
6562 
6563     // Compare the characters at index in cnt1
6564     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
6565     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6566     subl(result, cnt2);
6567     jmp(POP_LABEL);
6568 
6569     // Setup the registers to start vector comparison loop
6570     bind(COMPARE_WIDE_VECTORS);
6571     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6572       lea(str1, Address(str1, result, scale));
6573       lea(str2, Address(str2, result, scale));
6574     } else {
6575       lea(str1, Address(str1, result, scale1));
6576       lea(str2, Address(str2, result, scale2));
6577     }
6578     subl(result, stride2);
6579     subl(cnt2, stride2);
6580     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
6581     negptr(result);
6582 
6583     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6584     bind(COMPARE_WIDE_VECTORS_LOOP);
6585 
6586 #ifdef _LP64
6587     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
6588       cmpl(cnt2, stride2x2);
6589       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
6590       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
6591       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
6592 
6593       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
6594       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6595         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
6596         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6597       } else {
6598         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
6599         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6600       }
6601       kortestql(k7, k7);
6602       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
6603       addptr(result, stride2x2);  // update since we already compared at this addr
6604       subl(cnt2, stride2x2);      // and sub the size too
6605       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
6606 
6607       vpxor(vec1, vec1);
6608       jmpb(COMPARE_WIDE_TAIL);
6609     }//if (VM_Version::supports_avx512vlbw())
6610 #endif // _LP64
6611 
6612 
6613     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6614     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6615       vmovdqu(vec1, Address(str1, result, scale));
6616       vpxor(vec1, Address(str2, result, scale));
6617     } else {
6618       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
6619       vpxor(vec1, Address(str2, result, scale2));
6620     }
6621     vptest(vec1, vec1);
6622     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
6623     addptr(result, stride2);
6624     subl(cnt2, stride2);
6625     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6626     // clean upper bits of YMM registers
6627     vpxor(vec1, vec1);
6628 
6629     // compare wide vectors tail
6630     bind(COMPARE_WIDE_TAIL);
6631     testptr(result, result);
6632     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6633 
6634     movl(result, stride2);
6635     movl(cnt2, result);
6636     negptr(result);
6637     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6638 
6639     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6640     bind(VECTOR_NOT_EQUAL);
6641     // clean upper bits of YMM registers
6642     vpxor(vec1, vec1);
6643     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6644       lea(str1, Address(str1, result, scale));
6645       lea(str2, Address(str2, result, scale));
6646     } else {
6647       lea(str1, Address(str1, result, scale1));
6648       lea(str2, Address(str2, result, scale2));
6649     }
6650     jmp(COMPARE_16_CHARS);
6651 
6652     // Compare tail chars, length between 1 and 15 chars
6653     bind(COMPARE_TAIL_LONG);
6654     movl(cnt2, result);
6655     cmpl(cnt2, stride);
6656     jcc(Assembler::less, COMPARE_SMALL_STR);
6657 
6658     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6659       movdqu(vec1, Address(str1, 0));
6660     } else {
6661       pmovzxbw(vec1, Address(str1, 0));
6662     }
6663     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6664     jcc(Assembler::below, COMPARE_INDEX_CHAR);
6665     subptr(cnt2, stride);
6666     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6667     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6668       lea(str1, Address(str1, result, scale));
6669       lea(str2, Address(str2, result, scale));
6670     } else {
6671       lea(str1, Address(str1, result, scale1));
6672       lea(str2, Address(str2, result, scale2));
6673     }
6674     negptr(cnt2);
6675     jmpb(WHILE_HEAD_LABEL);
6676 
6677     bind(COMPARE_SMALL_STR);
6678   } else if (UseSSE42Intrinsics) {
6679     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6680     int pcmpmask = 0x19;
6681     // Set up to compare 8-char (16-byte) vectors,
6682     // starting from the first character again because it has an aligned address.
6683     movl(result, cnt2);
6684     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
6685     if (ae == StrIntrinsicNode::LL) {
6686       pcmpmask &= ~0x01;
6687     }
6688     jcc(Assembler::zero, COMPARE_TAIL);
6689     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6690       lea(str1, Address(str1, result, scale));
6691       lea(str2, Address(str2, result, scale));
6692     } else {
6693       lea(str1, Address(str1, result, scale1));
6694       lea(str2, Address(str2, result, scale2));
6695     }
6696     negptr(result);
6697 
6698     // pcmpestri
6699     //   inputs:
6700     //     vec1- substring
6701     //     rax - negative string length (elements count)
6702     //     mem - scanned string
6703     //     rdx - string length (elements count)
6704     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
6705     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
6706     //   outputs:
6707     //     rcx - first mismatched element index
6708     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6709 
6710     bind(COMPARE_WIDE_VECTORS);
6711     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6712       movdqu(vec1, Address(str1, result, scale));
6713       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6714     } else {
6715       pmovzxbw(vec1, Address(str1, result, scale1));
6716       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6717     }
6718     // After pcmpestri cnt1(rcx) contains mismatched element index
6719 
6720     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
6721     addptr(result, stride);
6722     subptr(cnt2, stride);
6723     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6724 
6725     // compare wide vectors tail
6726     testptr(result, result);
6727     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6728 
6729     movl(cnt2, stride);
6730     movl(result, stride);
6731     negptr(result);
6732     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6733       movdqu(vec1, Address(str1, result, scale));
6734       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6735     } else {
6736       pmovzxbw(vec1, Address(str1, result, scale1));
6737       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6738     }
6739     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6740 
6741     // Mismatched characters in the vectors
6742     bind(VECTOR_NOT_EQUAL);
6743     addptr(cnt1, result);
6744     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6745     subl(result, cnt2);
6746     jmpb(POP_LABEL);
6747 
6748     bind(COMPARE_TAIL); // limit is zero
6749     movl(cnt2, result);
6750     // Fallthru to tail compare
6751   }
6752   // Shift str2 and str1 to the end of the arrays, negate min
6753   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6754     lea(str1, Address(str1, cnt2, scale));
6755     lea(str2, Address(str2, cnt2, scale));
6756   } else {
6757     lea(str1, Address(str1, cnt2, scale1));
6758     lea(str2, Address(str2, cnt2, scale2));
6759   }
6760   decrementl(cnt2);  // first character was compared already
6761   negptr(cnt2);
6762 
6763   // Compare the rest of the elements
6764   bind(WHILE_HEAD_LABEL);
6765   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
6766   subl(result, cnt1);
6767   jccb(Assembler::notZero, POP_LABEL);
6768   increment(cnt2);
6769   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6770 
6771   // Strings are equal up to min length.  Return the length difference.
6772   bind(LENGTH_DIFF_LABEL);
6773   pop(result);
6774   if (ae == StrIntrinsicNode::UU) {
6775     // Divide diff by 2 to get number of chars
6776     sarl(result, 1);
6777   }
6778   jmpb(DONE_LABEL);
6779 
6780 #ifdef _LP64
6781   if (VM_Version::supports_avx512vlbw()) {
6782 
6783     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
6784 
6785     kmovql(cnt1, k7);
6786     notq(cnt1);
6787     bsfq(cnt2, cnt1);
6788     if (ae != StrIntrinsicNode::LL) {
6789       // Divide diff by 2 to get number of chars
6790       sarl(cnt2, 1);
6791     }
6792     addq(result, cnt2);
6793     if (ae == StrIntrinsicNode::LL) {
6794       load_unsigned_byte(cnt1, Address(str2, result));
6795       load_unsigned_byte(result, Address(str1, result));
6796     } else if (ae == StrIntrinsicNode::UU) {
6797       load_unsigned_short(cnt1, Address(str2, result, scale));
6798       load_unsigned_short(result, Address(str1, result, scale));
6799     } else {
6800       load_unsigned_short(cnt1, Address(str2, result, scale2));
6801       load_unsigned_byte(result, Address(str1, result, scale1));
6802     }
6803     subl(result, cnt1);
6804     jmpb(POP_LABEL);
6805   }//if (VM_Version::supports_avx512vlbw())
6806 #endif // _LP64
6807 
6808   // Discard the stored length difference
6809   bind(POP_LABEL);
6810   pop(cnt1);
6811 
6812   // That's it
6813   bind(DONE_LABEL);
6814   if(ae == StrIntrinsicNode::UL) {
6815     negl(result);
6816   }
6817 
6818 }
6819 
6820 // Search for a non-ASCII character (negative byte value) in a byte array;
6821 // return true if one is found and false otherwise.
6822 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
6823 //   @HotSpotIntrinsicCandidate
6824 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
6825 //     for (int i = off; i < off + len; i++) {
6826 //       if (ba[i] < 0) {
6827 //         return true;
6828 //       }
6829 //     }
6830 //     return false;
6831 //   }
6832 void MacroAssembler::has_negatives(Register ary1, Register len,
6833   Register result, Register tmp1,
6834   XMMRegister vec1, XMMRegister vec2) {
6835   // rsi: byte array
6836   // rcx: len
6837   // rax: result
6838   ShortBranchVerifier sbv(this);
6839   assert_different_registers(ary1, len, result, tmp1);
6840   assert_different_registers(vec1, vec2);
6841   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
6842 
6843   // len == 0
6844   testl(len, len);
6845   jcc(Assembler::zero, FALSE_LABEL);
6846 
6847   if ((UseAVX > 2) && // AVX512
6848     VM_Version::supports_avx512vlbw() &&
6849     VM_Version::supports_bmi2()) {
6850 
6851     Label test_64_loop, test_tail;
6852     Register tmp3_aliased = len;
6853 
6854     movl(tmp1, len);
6855     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
6856 
6857     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
6858     andl(len, ~(64 - 1));    // vector count (in chars)
6859     jccb(Assembler::zero, test_tail);
6860 
6861     lea(ary1, Address(ary1, len, Address::times_1));
6862     negptr(len);
6863 
6864     bind(test_64_loop);
6865     // Check whether our 64 byte-sized elements contain negatives
6866     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
6867     kortestql(k2, k2);
6868     jcc(Assembler::notZero, TRUE_LABEL);
6869 
6870     addptr(len, 64);
6871     jccb(Assembler::notZero, test_64_loop);
6872 
6873 
6874     bind(test_tail);
6875     // bail out when there is nothing to be done
6876     testl(tmp1, -1);
6877     jcc(Assembler::zero, FALSE_LABEL);
6878 
6879     // ~(~0 << len) applied up to two times (for 32-bit scenario)
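         // Hedged illustration: the resulting mask has the low tmp1 bits set,
         // e.g. a 5-byte tail yields 0b11111:
         //   long mask = ~(~0L << tailCount);   // tailCount in [1, 63]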
6880 #ifdef _LP64
6881     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
6882     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
6883     notq(tmp3_aliased);
6884     kmovql(k3, tmp3_aliased);
6885 #else
6886     Label k_init;
6887     jmp(k_init);
6888 
6889     // We cannot read 64 bits from a general-purpose register, so we move the
6890     // data required to compose 64 1's into the instruction stream.
6891     // We emit a 64-byte-wide series of elements from 0..63 which is later used
6892     // as a compare target against the tail count contained in the tmp1 register.
6893     // The result is a k register having tmp1 consecutive 1's,
6894     // counting from the least significant bit.
6895     address tmp = pc();
6896     emit_int64(0x0706050403020100);
6897     emit_int64(0x0F0E0D0C0B0A0908);
6898     emit_int64(0x1716151413121110);
6899     emit_int64(0x1F1E1D1C1B1A1918);
6900     emit_int64(0x2726252423222120);
6901     emit_int64(0x2F2E2D2C2B2A2928);
6902     emit_int64(0x3736353433323130);
6903     emit_int64(0x3F3E3D3C3B3A3938);
6904 
6905     bind(k_init);
6906     lea(len, InternalAddress(tmp));
6907     // create mask to test for negative byte inside a vector
6908     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
6909     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
6910 
6911 #endif
6912     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
6913     ktestq(k2, k3);
6914     jcc(Assembler::notZero, TRUE_LABEL);
6915 
6916     jmp(FALSE_LABEL);
6917   } else {
6918     movl(result, len); // copy
6919 
6920     if (UseAVX == 2 && UseSSE >= 2) {
6921       // With AVX2, use 32-byte vector compare
6922       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6923 
6924       // Compare 32-byte vectors
6925       andl(result, 0x0000001f);  //   tail count (in bytes)
6926       andl(len, 0xffffffe0);   // vector count (in bytes)
6927       jccb(Assembler::zero, COMPARE_TAIL);
6928 
6929       lea(ary1, Address(ary1, len, Address::times_1));
6930       negptr(len);
6931 
6932       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
6933       movdl(vec2, tmp1);
6934       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
6935 
6936       bind(COMPARE_WIDE_VECTORS);
6937       vmovdqu(vec1, Address(ary1, len, Address::times_1));
6938       vptest(vec1, vec2);
6939       jccb(Assembler::notZero, TRUE_LABEL);
6940       addptr(len, 32);
6941       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6942 
6943       testl(result, result);
6944       jccb(Assembler::zero, FALSE_LABEL);
6945 
6946       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6947       vptest(vec1, vec2);
6948       jccb(Assembler::notZero, TRUE_LABEL);
6949       jmpb(FALSE_LABEL);
6950 
6951       bind(COMPARE_TAIL); // len is zero
6952       movl(len, result);
6953       // Fallthru to tail compare
6954     } else if (UseSSE42Intrinsics) {
6955       // With SSE4.2, use double quad vector compare
6956       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6957 
6958       // Compare 16-byte vectors
6959       andl(result, 0x0000000f);  //   tail count (in bytes)
6960       andl(len, 0xfffffff0);   // vector count (in bytes)
6961       jcc(Assembler::zero, COMPARE_TAIL);
6962 
6963       lea(ary1, Address(ary1, len, Address::times_1));
6964       negptr(len);
6965 
6966       movl(tmp1, 0x80808080);
6967       movdl(vec2, tmp1);
6968       pshufd(vec2, vec2, 0);
6969 
6970       bind(COMPARE_WIDE_VECTORS);
6971       movdqu(vec1, Address(ary1, len, Address::times_1));
6972       ptest(vec1, vec2);
6973       jcc(Assembler::notZero, TRUE_LABEL);
6974       addptr(len, 16);
6975       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6976 
6977       testl(result, result);
6978       jcc(Assembler::zero, FALSE_LABEL);
6979 
6980       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
6981       ptest(vec1, vec2);
6982       jccb(Assembler::notZero, TRUE_LABEL);
6983       jmpb(FALSE_LABEL);
6984 
6985       bind(COMPARE_TAIL); // len is zero
6986       movl(len, result);
6987       // Fallthru to tail compare
6988     }
6989   }
6990   // Compare 4-byte vectors
6991   andl(len, 0xfffffffc); // vector count (in bytes)
6992   jccb(Assembler::zero, COMPARE_CHAR);
6993 
6994   lea(ary1, Address(ary1, len, Address::times_1));
6995   negptr(len);
6996 
6997   bind(COMPARE_VECTORS);
6998   movl(tmp1, Address(ary1, len, Address::times_1));
6999   andl(tmp1, 0x80808080);
7000   jccb(Assembler::notZero, TRUE_LABEL);
7001   addptr(len, 4);
7002   jcc(Assembler::notZero, COMPARE_VECTORS);
7003 
7004   // Compare trailing char (final 2 bytes), if any
7005   bind(COMPARE_CHAR);
7006   testl(result, 0x2);   // tail  char
7007   jccb(Assembler::zero, COMPARE_BYTE);
7008   load_unsigned_short(tmp1, Address(ary1, 0));
7009   andl(tmp1, 0x00008080);
7010   jccb(Assembler::notZero, TRUE_LABEL);
7011   subptr(result, 2);
7012   lea(ary1, Address(ary1, 2));
7013 
7014   bind(COMPARE_BYTE);
7015   testl(result, 0x1);   // tail  byte
7016   jccb(Assembler::zero, FALSE_LABEL);
7017   load_unsigned_byte(tmp1, Address(ary1, 0));
7018   andl(tmp1, 0x00000080);
7019   jccb(Assembler::notEqual, TRUE_LABEL);
7020   jmpb(FALSE_LABEL);
7021 
7022   bind(TRUE_LABEL);
7023   movl(result, 1);   // return true
7024   jmpb(DONE);
7025 
7026   bind(FALSE_LABEL);
7027   xorl(result, result); // return false
7028 
7029   // That's it
7030   bind(DONE);
7031   if (UseAVX >= 2 && UseSSE >= 2) {
7032     // clean upper bits of YMM registers
7033     vpxor(vec1, vec1);
7034     vpxor(vec2, vec2);
7035   }
7036 }
7037 // Compare char[] or byte[] arrays, or substrings of them, aligned to 4 bytes.
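     // Java-level semantics for the is_array_equ case, as a hedged reference sketch
     // (the substring case skips the identity, null and length checks):
     //   static boolean equals(byte[] a, byte[] b) {
     //     if (a == b) return true;
     //     if (a == null || b == null || a.length != b.length) return false;
     //     for (int i = 0; i < a.length; i++) {
     //       if (a[i] != b[i]) return false;
     //     }
     //     return true;
     //   }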
7038 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
7039                                    Register limit, Register result, Register chr,
7040                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
7041   ShortBranchVerifier sbv(this);
7042   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
7043 
7044   int length_offset  = arrayOopDesc::length_offset_in_bytes();
7045   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
7046 
7047   if (is_array_equ) {
7048     // Check the input args
7049     cmpoop(ary1, ary2);
7050     jcc(Assembler::equal, TRUE_LABEL);
7051 
7052     // Need additional checks for arrays_equals.
7053     testptr(ary1, ary1);
7054     jcc(Assembler::zero, FALSE_LABEL);
7055     testptr(ary2, ary2);
7056     jcc(Assembler::zero, FALSE_LABEL);
7057 
7058     // Check the lengths
7059     movl(limit, Address(ary1, length_offset));
7060     cmpl(limit, Address(ary2, length_offset));
7061     jcc(Assembler::notEqual, FALSE_LABEL);
7062   }
7063 
7064   // count == 0
7065   testl(limit, limit);
7066   jcc(Assembler::zero, TRUE_LABEL);
7067 
7068   if (is_array_equ) {
7069     // Load array address
7070     lea(ary1, Address(ary1, base_offset));
7071     lea(ary2, Address(ary2, base_offset));
7072   }
7073 
7074   if (is_array_equ && is_char) {
7075     // arrays_equals when used for char[].
7076     shll(limit, 1);      // byte count != 0
7077   }
7078   movl(result, limit); // copy
7079 
7080   if (UseAVX >= 2) {
7081     // With AVX2, use 32-byte vector compare
7082     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7083 
7084     // Compare 32-byte vectors
7085     andl(result, 0x0000001f);  //   tail count (in bytes)
7086     andl(limit, 0xffffffe0);   // vector count (in bytes)
7087     jcc(Assembler::zero, COMPARE_TAIL);
7088 
7089     lea(ary1, Address(ary1, limit, Address::times_1));
7090     lea(ary2, Address(ary2, limit, Address::times_1));
7091     negptr(limit);
7092 
7093     bind(COMPARE_WIDE_VECTORS);
7094 
7095 #ifdef _LP64
7096     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7097       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
7098 
7099       cmpl(limit, -64);
7100       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7101 
7102       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7103 
7104       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
7105       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
7106       kortestql(k7, k7);
7107       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7108       addptr(limit, 64);  // update since we already compared at this addr
7109       cmpl(limit, -64);
7110       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7111 
7112       // At this point we may still need to compare -limit+result bytes.
7113       // We could execute the next two instructions and just continue via the non-wide path:
7114       //  cmpl(limit, 0);
7115       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
7116       // But since we stopped at the points ary{1,2}+limit which are
7117       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
7118       // (|limit| <= 32 and result < 32),
7119       // we may just compare the last 64 bytes.
7120       //
7121       addptr(result, -64);   // it is safe because we just came from this area
7122       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
7123       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
7124       kortestql(k7, k7);
7125       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7126 
7127       jmp(TRUE_LABEL);
7128 
7129       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7130 
7131     }//if (VM_Version::supports_avx512vlbw())
7132 #endif //_LP64
7133 
7134     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
7135     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
7136     vpxor(vec1, vec2);
7137 
7138     vptest(vec1, vec1);
7139     jcc(Assembler::notZero, FALSE_LABEL);
7140     addptr(limit, 32);
7141     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7142 
7143     testl(result, result);
7144     jcc(Assembler::zero, TRUE_LABEL);
7145 
7146     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7147     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
7148     vpxor(vec1, vec2);
7149 
7150     vptest(vec1, vec1);
7151     jccb(Assembler::notZero, FALSE_LABEL);
7152     jmpb(TRUE_LABEL);
7153 
7154     bind(COMPARE_TAIL); // limit is zero
7155     movl(limit, result);
7156     // Fallthru to tail compare
7157   } else if (UseSSE42Intrinsics) {
7158     // With SSE4.2, use double quad vector compare
7159     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7160 
7161     // Compare 16-byte vectors
7162     andl(result, 0x0000000f);  //   tail count (in bytes)
7163     andl(limit, 0xfffffff0);   // vector count (in bytes)
7164     jcc(Assembler::zero, COMPARE_TAIL);
7165 
7166     lea(ary1, Address(ary1, limit, Address::times_1));
7167     lea(ary2, Address(ary2, limit, Address::times_1));
7168     negptr(limit);
7169 
7170     bind(COMPARE_WIDE_VECTORS);
7171     movdqu(vec1, Address(ary1, limit, Address::times_1));
7172     movdqu(vec2, Address(ary2, limit, Address::times_1));
7173     pxor(vec1, vec2);
7174 
7175     ptest(vec1, vec1);
7176     jcc(Assembler::notZero, FALSE_LABEL);
7177     addptr(limit, 16);
7178     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7179 
7180     testl(result, result);
7181     jcc(Assembler::zero, TRUE_LABEL);
7182 
7183     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7184     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7185     pxor(vec1, vec2);
7186 
7187     ptest(vec1, vec1);
7188     jccb(Assembler::notZero, FALSE_LABEL);
7189     jmpb(TRUE_LABEL);
7190 
7191     bind(COMPARE_TAIL); // limit is zero
7192     movl(limit, result);
7193     // Fallthru to tail compare
7194   }
7195 
7196   // Compare 4-byte vectors
7197   andl(limit, 0xfffffffc); // vector count (in bytes)
7198   jccb(Assembler::zero, COMPARE_CHAR);
7199 
7200   lea(ary1, Address(ary1, limit, Address::times_1));
7201   lea(ary2, Address(ary2, limit, Address::times_1));
7202   negptr(limit);
7203 
7204   bind(COMPARE_VECTORS);
7205   movl(chr, Address(ary1, limit, Address::times_1));
7206   cmpl(chr, Address(ary2, limit, Address::times_1));
7207   jccb(Assembler::notEqual, FALSE_LABEL);
7208   addptr(limit, 4);
7209   jcc(Assembler::notZero, COMPARE_VECTORS);
7210 
7211   // Compare trailing char (final 2 bytes), if any
7212   bind(COMPARE_CHAR);
7213   testl(result, 0x2);   // tail  char
7214   jccb(Assembler::zero, COMPARE_BYTE);
7215   load_unsigned_short(chr, Address(ary1, 0));
7216   load_unsigned_short(limit, Address(ary2, 0));
7217   cmpl(chr, limit);
7218   jccb(Assembler::notEqual, FALSE_LABEL);
7219 
7220   if (is_array_equ && is_char) {
7221     bind(COMPARE_BYTE);
7222   } else {
7223     lea(ary1, Address(ary1, 2));
7224     lea(ary2, Address(ary2, 2));
7225 
7226     bind(COMPARE_BYTE);
7227     testl(result, 0x1);   // tail  byte
7228     jccb(Assembler::zero, TRUE_LABEL);
7229     load_unsigned_byte(chr, Address(ary1, 0));
7230     load_unsigned_byte(limit, Address(ary2, 0));
7231     cmpl(chr, limit);
7232     jccb(Assembler::notEqual, FALSE_LABEL);
7233   }
7234   bind(TRUE_LABEL);
7235   movl(result, 1);   // return true
7236   jmpb(DONE);
7237 
7238   bind(FALSE_LABEL);
7239   xorl(result, result); // return false
7240 
7241   // That's it
7242   bind(DONE);
7243   if (UseAVX >= 2) {
7244     // clean upper bits of YMM registers
7245     vpxor(vec1, vec1);
7246     vpxor(vec2, vec2);
7247   }
7248 }
7249 
7250 #endif
7251 
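     // Fill 'count' elements of type 't' starting at 'to' with 'value', using
     // 'rtmp' and 'xtmp' as scratch registers. Alignment and tail elements are
     // stored one at a time; the bulk of the fill uses wide stores (up to 64
     // bytes per iteration) when SSE2/AVX are available.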
7252 void MacroAssembler::generate_fill(BasicType t, bool aligned,
7253                                    Register to, Register value, Register count,
7254                                    Register rtmp, XMMRegister xtmp) {
7255   ShortBranchVerifier sbv(this);
7256   assert_different_registers(to, value, count, rtmp);
7257   Label L_exit;
7258   Label L_fill_2_bytes, L_fill_4_bytes;
7259 
7260   int shift = -1;
7261   switch (t) {
7262     case T_BYTE:
7263       shift = 2;
7264       break;
7265     case T_SHORT:
7266       shift = 1;
7267       break;
7268     case T_INT:
7269       shift = 0;
7270       break;
7271     default: ShouldNotReachHere();
7272   }
7273 
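       // Replicate the fill value across a full 32-bit word so the same wide
       // stores work for byte, short and int elements.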
7274   if (t == T_BYTE) {
7275     andl(value, 0xff);
7276     movl(rtmp, value);
7277     shll(rtmp, 8);
7278     orl(value, rtmp);
7279   }
7280   if (t == T_SHORT) {
7281     andl(value, 0xffff);
7282   }
7283   if (t == T_BYTE || t == T_SHORT) {
7284     movl(rtmp, value);
7285     shll(rtmp, 16);
7286     orl(value, rtmp);
7287   }
7288 
7289   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) are filled element by element
7290   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7291   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7292     Label L_skip_align2;
7293     // align the destination address to a 4-byte boundary
7294     if (t == T_BYTE) {
7295       Label L_skip_align1;
7296       // One byte misalignment happens only for byte arrays
7297       testptr(to, 1);
7298       jccb(Assembler::zero, L_skip_align1);
7299       movb(Address(to, 0), value);
7300       increment(to);
7301       decrement(count);
7302       BIND(L_skip_align1);
7303     }
7304     // Two bytes misalignment happens only for byte and short (char) arrays
7305     testptr(to, 2);
7306     jccb(Assembler::zero, L_skip_align2);
7307     movw(Address(to, 0), value);
7308     addptr(to, 2);
7309     subl(count, 1<<(shift-1));
7310     BIND(L_skip_align2);
7311   }
7312   if (UseSSE < 2) {
7313     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7314     // Fill 32-byte chunks
7315     subl(count, 8 << shift);
7316     jcc(Assembler::less, L_check_fill_8_bytes);
7317     align(16);
7318 
7319     BIND(L_fill_32_bytes_loop);
7320 
7321     for (int i = 0; i < 32; i += 4) {
7322       movl(Address(to, i), value);
7323     }
7324 
7325     addptr(to, 32);
7326     subl(count, 8 << shift);
7327     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7328     BIND(L_check_fill_8_bytes);
7329     addl(count, 8 << shift);
7330     jccb(Assembler::zero, L_exit);
7331     jmpb(L_fill_8_bytes);
7332 
7333     //
7334     // length is too short, just fill qwords
7335     //
7336     BIND(L_fill_8_bytes_loop);
7337     movl(Address(to, 0), value);
7338     movl(Address(to, 4), value);
7339     addptr(to, 8);
7340     BIND(L_fill_8_bytes);
7341     subl(count, 1 << (shift + 1));
7342     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7343     // fall through to fill 4 bytes
7344   } else {
7345     Label L_fill_32_bytes;
7346     if (!UseUnalignedLoadStores) {
7347       // align to 8 bytes, we know we are 4 byte aligned to start
7348       testptr(to, 4);
7349       jccb(Assembler::zero, L_fill_32_bytes);
7350       movl(Address(to, 0), value);
7351       addptr(to, 4);
7352       subl(count, 1<<shift);
7353     }
7354     BIND(L_fill_32_bytes);
7355     {
7356       assert( UseSSE >= 2, "supported cpu only" );
7357       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7358       movdl(xtmp, value);
7359       if (UseAVX > 2 && UseUnalignedLoadStores) {
7360         // Fill 64-byte chunks
7361         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7362         vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7363 
7364         subl(count, 16 << shift);
7365         jcc(Assembler::less, L_check_fill_32_bytes);
7366         align(16);
7367 
7368         BIND(L_fill_64_bytes_loop);
7369         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7370         addptr(to, 64);
7371         subl(count, 16 << shift);
7372         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7373 
7374         BIND(L_check_fill_32_bytes);
7375         addl(count, 8 << shift);
7376         jccb(Assembler::less, L_check_fill_8_bytes);
7377         vmovdqu(Address(to, 0), xtmp);
7378         addptr(to, 32);
7379         subl(count, 8 << shift);
7380 
7381         BIND(L_check_fill_8_bytes);
7382       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7383         // Fill 64-byte chunks
7384         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7385         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7386 
7387         subl(count, 16 << shift);
7388         jcc(Assembler::less, L_check_fill_32_bytes);
7389         align(16);
7390 
7391         BIND(L_fill_64_bytes_loop);
7392         vmovdqu(Address(to, 0), xtmp);
7393         vmovdqu(Address(to, 32), xtmp);
7394         addptr(to, 64);
7395         subl(count, 16 << shift);
7396         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7397 
7398         BIND(L_check_fill_32_bytes);
7399         addl(count, 8 << shift);
7400         jccb(Assembler::less, L_check_fill_8_bytes);
7401         vmovdqu(Address(to, 0), xtmp);
7402         addptr(to, 32);
7403         subl(count, 8 << shift);
7404 
7405         BIND(L_check_fill_8_bytes);
7406         // clean upper bits of YMM registers
7407         movdl(xtmp, value);
7408         pshufd(xtmp, xtmp, 0);
7409       } else {
7410         // Fill 32-byte chunks
7411         pshufd(xtmp, xtmp, 0);
7412 
7413         subl(count, 8 << shift);
7414         jcc(Assembler::less, L_check_fill_8_bytes);
7415         align(16);
7416 
7417         BIND(L_fill_32_bytes_loop);
7418 
7419         if (UseUnalignedLoadStores) {
7420           movdqu(Address(to, 0), xtmp);
7421           movdqu(Address(to, 16), xtmp);
7422         } else {
7423           movq(Address(to, 0), xtmp);
7424           movq(Address(to, 8), xtmp);
7425           movq(Address(to, 16), xtmp);
7426           movq(Address(to, 24), xtmp);
7427         }
7428 
7429         addptr(to, 32);
7430         subl(count, 8 << shift);
7431         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7432 
7433         BIND(L_check_fill_8_bytes);
7434       }
7435       addl(count, 8 << shift);
7436       jccb(Assembler::zero, L_exit);
7437       jmpb(L_fill_8_bytes);
7438 
7439       //
7440       // length is too short, just fill qwords
7441       //
7442       BIND(L_fill_8_bytes_loop);
7443       movq(Address(to, 0), xtmp);
7444       addptr(to, 8);
7445       BIND(L_fill_8_bytes);
7446       subl(count, 1 << (shift + 1));
7447       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7448     }
7449   }
7450   // fill trailing 4 bytes
7451   BIND(L_fill_4_bytes);
7452   testl(count, 1<<shift);
7453   jccb(Assembler::zero, L_fill_2_bytes);
7454   movl(Address(to, 0), value);
7455   if (t == T_BYTE || t == T_SHORT) {
7456     Label L_fill_byte;
7457     addptr(to, 4);
7458     BIND(L_fill_2_bytes);
7459     // fill trailing 2 bytes
7460     testl(count, 1<<(shift-1));
7461     jccb(Assembler::zero, L_fill_byte);
7462     movw(Address(to, 0), value);
7463     if (t == T_BYTE) {
7464       addptr(to, 2);
7465       BIND(L_fill_byte);
7466       // fill trailing byte
7467       testl(count, 1);
7468       jccb(Assembler::zero, L_exit);
7469       movb(Address(to, 0), value);
7470     } else {
7471       BIND(L_fill_byte);
7472     }
7473   } else {
7474     BIND(L_fill_2_bytes);
7475   }
7476   BIND(L_exit);
7477 }
7478 
7479 // encode char[] to byte[] in ISO_8859_1
7480 //   @HotSpotIntrinsicCandidate
7481 //   private static int implEncodeISOArray(byte[] sa, int sp,
7482 //                                         byte[] da, int dp, int len) {
7483 //     int i = 0;
7484 //     for (; i < len; i++) {
7485 //       char c = StringUTF16.getChar(sa, sp++);
7486 //       if (c > '\u00FF')
7487 //         break;
7488 //       da[dp++] = (byte)c;
7489 //     }
7490 //     return i;
7491 //   }
7492 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7493   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7494   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7495   Register tmp5, Register result) {
7496 
7497   // rsi: src
7498   // rdi: dst
7499   // rdx: len
7500   // rcx: tmp5
7501   // rax: result
7502   ShortBranchVerifier sbv(this);
7503   assert_different_registers(src, dst, len, tmp5, result);
7504   Label L_done, L_copy_1_char, L_copy_1_char_exit;
7505 
7506   // set result
7507   xorl(result, result);
7508   // check for zero length
7509   testl(len, len);
7510   jcc(Assembler::zero, L_done);
7511 
7512   movl(result, len);
7513 
7514   // Setup pointers
7515   lea(src, Address(src, len, Address::times_2)); // char[]
7516   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7517   negptr(len);
7518 
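       // Vectorized path: compress 32, 16 or 8 chars per iteration depending on
       // the available instruction set. If a char above 0xFF is detected in the
       // current chunk, bail out so the scalar loop below can stop exactly at it.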
7519   if (UseSSE42Intrinsics || UseAVX >= 2) {
7520     Label L_copy_8_chars, L_copy_8_chars_exit;
7521     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7522 
7523     if (UseAVX >= 2) {
7524       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7525       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7526       movdl(tmp1Reg, tmp5);
7527       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
7528       jmp(L_chars_32_check);
7529 
7530       bind(L_copy_32_chars);
7531       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7532       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7533       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7534       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7535       jccb(Assembler::notZero, L_copy_32_chars_exit);
7536       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7537       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7538       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7539 
7540       bind(L_chars_32_check);
7541       addptr(len, 32);
7542       jcc(Assembler::lessEqual, L_copy_32_chars);
7543 
7544       bind(L_copy_32_chars_exit);
7545       subptr(len, 16);
7546       jccb(Assembler::greater, L_copy_16_chars_exit);
7547 
7548     } else if (UseSSE42Intrinsics) {
7549       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7550       movdl(tmp1Reg, tmp5);
7551       pshufd(tmp1Reg, tmp1Reg, 0);
7552       jmpb(L_chars_16_check);
7553     }
7554 
7555     bind(L_copy_16_chars);
7556     if (UseAVX >= 2) {
7557       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7558       vptest(tmp2Reg, tmp1Reg);
7559       jcc(Assembler::notZero, L_copy_16_chars_exit);
7560       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7561       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7562     } else {
7563       if (UseAVX > 0) {
7564         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7565         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7566         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7567       } else {
7568         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7569         por(tmp2Reg, tmp3Reg);
7570         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7571         por(tmp2Reg, tmp4Reg);
7572       }
7573       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7574       jccb(Assembler::notZero, L_copy_16_chars_exit);
7575       packuswb(tmp3Reg, tmp4Reg);
7576     }
7577     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7578 
7579     bind(L_chars_16_check);
7580     addptr(len, 16);
7581     jcc(Assembler::lessEqual, L_copy_16_chars);
7582 
7583     bind(L_copy_16_chars_exit);
7584     if (UseAVX >= 2) {
7585       // clean upper bits of YMM registers
7586       vpxor(tmp2Reg, tmp2Reg);
7587       vpxor(tmp3Reg, tmp3Reg);
7588       vpxor(tmp4Reg, tmp4Reg);
7589       movdl(tmp1Reg, tmp5);
7590       pshufd(tmp1Reg, tmp1Reg, 0);
7591     }
7592     subptr(len, 8);
7593     jccb(Assembler::greater, L_copy_8_chars_exit);
7594 
7595     bind(L_copy_8_chars);
7596     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7597     ptest(tmp3Reg, tmp1Reg);
7598     jccb(Assembler::notZero, L_copy_8_chars_exit);
7599     packuswb(tmp3Reg, tmp1Reg);
7600     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7601     addptr(len, 8);
7602     jccb(Assembler::lessEqual, L_copy_8_chars);
7603 
7604     bind(L_copy_8_chars_exit);
7605     subptr(len, 8);
7606     jccb(Assembler::zero, L_done);
7607   }
7608 
7609   bind(L_copy_1_char);
7610   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7611   testl(tmp5, 0xff00);      // check if Unicode char
7612   jccb(Assembler::notZero, L_copy_1_char_exit);
7613   movb(Address(dst, len, Address::times_1, 0), tmp5);
7614   addptr(len, 1);
7615   jccb(Assembler::less, L_copy_1_char);
7616 
7617   bind(L_copy_1_char_exit);
7618   addptr(result, len); // len is negative count of not processed elements
7619 
7620   bind(L_done);
7621 }
7622 
7623 #ifdef _LP64
7624 /**
7625  * Helper for multiply_to_len(): dest_hi:dest_lo += src1 + src2 (128-bit add with carry propagation).
7626  */
7627 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7628   addq(dest_lo, src1);
7629   adcq(dest_hi, 0);
7630   addq(dest_lo, src2);
7631   adcq(dest_hi, 0);
7632 }
7633 
7634 /**
7635  * Multiply 64 bit by 64 bit first loop.
7636  */
7637 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7638                                            Register y, Register y_idx, Register z,
7639                                            Register carry, Register product,
7640                                            Register idx, Register kdx) {
7641   //
7642   //  jlong carry, x[], y[], z[];
7643   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7644   //    huge_128 product = y[idx] * x[xstart] + carry;
7645   //    z[kdx] = (jlong)product;
7646   //    carry  = (jlong)(product >>> 64);
7647   //  }
7648   //  z[xstart] = carry;
7649   //
7650 
7651   Label L_first_loop, L_first_loop_exit;
7652   Label L_one_x, L_one_y, L_multiply;
7653 
7654   decrementl(xstart);
7655   jcc(Assembler::negative, L_one_x);
7656 
7657   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7658   rorq(x_xstart, 32); // convert big-endian to little-endian
7659 
7660   bind(L_first_loop);
7661   decrementl(idx);
7662   jcc(Assembler::negative, L_first_loop_exit);
7663   decrementl(idx);
7664   jcc(Assembler::negative, L_one_y);
7665   movq(y_idx, Address(y, idx, Address::times_4,  0));
7666   rorq(y_idx, 32); // convert big-endian to little-endian
7667   bind(L_multiply);
7668   movq(product, x_xstart);
7669   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7670   addq(product, carry);
7671   adcq(rdx, 0);
7672   subl(kdx, 2);
7673   movl(Address(z, kdx, Address::times_4,  4), product);
7674   shrq(product, 32);
7675   movl(Address(z, kdx, Address::times_4,  0), product);
7676   movq(carry, rdx);
7677   jmp(L_first_loop);
7678 
7679   bind(L_one_y);
7680   movl(y_idx, Address(y,  0));
7681   jmp(L_multiply);
7682 
7683   bind(L_one_x);
7684   movl(x_xstart, Address(x,  0));
7685   jmp(L_first_loop);
7686 
7687   bind(L_first_loop_exit);
7688 }
7689 
7690 /**
7691  * Multiply 64 bit by 64 bit and add 128 bit.
7692  */
7693 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7694                                             Register yz_idx, Register idx,
7695                                             Register carry, Register product, int offset) {
7696   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7697   //     z[kdx] = (jlong)product;
7698 
7699   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
7700   rorq(yz_idx, 32); // convert big-endian to little-endian
7701   movq(product, x_xstart);
7702   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
7703   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
7704   rorq(yz_idx, 32); // convert big-endian to little-endian
7705 
7706   add2_with_carry(rdx, product, carry, yz_idx);
7707 
7708   movl(Address(z, idx, Address::times_4,  offset+4), product);
7709   shrq(product, 32);
7710   movl(Address(z, idx, Address::times_4,  offset), product);
7711 
7712 }
7713 
7714 /**
7715  * Multiply 128 bit by 128 bit. Unrolled inner loop.
7716  */
7717 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7718                                              Register yz_idx, Register idx, Register jdx,
7719                                              Register carry, Register product,
7720                                              Register carry2) {
7721   //   jlong carry, x[], y[], z[];
7722   //   int kdx = ystart+1;
7723   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7724   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7725   //     z[kdx+idx+1] = (jlong)product;
7726   //     jlong carry2  = (jlong)(product >>> 64);
7727   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7728   //     z[kdx+idx] = (jlong)product;
7729   //     carry  = (jlong)(product >>> 64);
7730   //   }
7731   //   idx += 2;
7732   //   if (idx > 0) {
7733   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7734   //     z[kdx+idx] = (jlong)product;
7735   //     carry  = (jlong)(product >>> 64);
7736   //   }
7737   //
7738 
7739   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7740 
7741   movl(jdx, idx);
7742   andl(jdx, 0xFFFFFFFC);
7743   shrl(jdx, 2);
7744 
7745   bind(L_third_loop);
7746   subl(jdx, 1);
7747   jcc(Assembler::negative, L_third_loop_exit);
7748   subl(idx, 4);
7749 
7750   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7751   movq(carry2, rdx);
7752 
7753   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7754   movq(carry, rdx);
7755   jmp(L_third_loop);
7756 
7757   bind (L_third_loop_exit);
7758 
7759   andl (idx, 0x3);
7760   jcc(Assembler::zero, L_post_third_loop_done);
7761 
7762   Label L_check_1;
7763   subl(idx, 2);
7764   jcc(Assembler::negative, L_check_1);
7765 
7766   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7767   movq(carry, rdx);
7768 
7769   bind (L_check_1);
7770   addl (idx, 0x2);
7771   andl (idx, 0x1);
7772   subl(idx, 1);
7773   jcc(Assembler::negative, L_post_third_loop_done);
7774 
7775   movl(yz_idx, Address(y, idx, Address::times_4,  0));
7776   movq(product, x_xstart);
7777   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7778   movl(yz_idx, Address(z, idx, Address::times_4,  0));
7779 
7780   add2_with_carry(rdx, product, yz_idx, carry);
7781 
7782   movl(Address(z, idx, Address::times_4,  0), product);
7783   shrq(product, 32);
7784 
7785   shlq(rdx, 32);
7786   orq(product, rdx);
7787   movq(carry, product);
7788 
7789   bind(L_post_third_loop_done);
7790 }
7791 
7792 /**
7793  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7794  *
7795  */
7796 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7797                                                   Register carry, Register carry2,
7798                                                   Register idx, Register jdx,
7799                                                   Register yz_idx1, Register yz_idx2,
7800                                                   Register tmp, Register tmp3, Register tmp4) {
7801   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7802 
7803   //   jlong carry, x[], y[], z[];
7804   //   int kdx = ystart+1;
7805   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7806   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7807   //     jlong carry2  = (jlong)(tmp3 >>> 64);
7808   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
7809   //     carry  = (jlong)(tmp4 >>> 64);
7810   //     z[kdx+idx+1] = (jlong)tmp3;
7811   //     z[kdx+idx] = (jlong)tmp4;
7812   //   }
7813   //   idx += 2;
7814   //   if (idx > 0) {
7815   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7816   //     z[kdx+idx] = (jlong)yz_idx1;
7817   //     carry  = (jlong)(yz_idx1 >>> 64);
7818   //   }
7819   //
7820 
7821   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7822 
7823   movl(jdx, idx);
7824   andl(jdx, 0xFFFFFFFC);
7825   shrl(jdx, 2);
7826 
7827   bind(L_third_loop);
7828   subl(jdx, 1);
7829   jcc(Assembler::negative, L_third_loop_exit);
7830   subl(idx, 4);
7831 
7832   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
7833   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7834   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
7835   rorxq(yz_idx2, yz_idx2, 32);
7836 
7837   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
7838   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
7839 
7840   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
7841   rorxq(yz_idx1, yz_idx1, 32);
7842   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7843   rorxq(yz_idx2, yz_idx2, 32);
7844 
7845   if (VM_Version::supports_adx()) {
7846     adcxq(tmp3, carry);
7847     adoxq(tmp3, yz_idx1);
7848 
7849     adcxq(tmp4, tmp);
7850     adoxq(tmp4, yz_idx2);
7851 
7852     movl(carry, 0); // does not affect flags
7853     adcxq(carry2, carry);
7854     adoxq(carry2, carry);
7855   } else {
7856     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7857     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7858   }
7859   movq(carry, carry2);
7860 
7861   movl(Address(z, idx, Address::times_4, 12), tmp3);
7862   shrq(tmp3, 32);
7863   movl(Address(z, idx, Address::times_4,  8), tmp3);
7864 
7865   movl(Address(z, idx, Address::times_4,  4), tmp4);
7866   shrq(tmp4, 32);
7867   movl(Address(z, idx, Address::times_4,  0), tmp4);
7868 
7869   jmp(L_third_loop);
7870 
7871   bind (L_third_loop_exit);
7872 
7873   andl (idx, 0x3);
7874   jcc(Assembler::zero, L_post_third_loop_done);
7875 
7876   Label L_check_1;
7877   subl(idx, 2);
7878   jcc(Assembler::negative, L_check_1);
7879 
7880   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
7881   rorxq(yz_idx1, yz_idx1, 32);
7882   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
7883   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7884   rorxq(yz_idx2, yz_idx2, 32);
7885 
7886   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7887 
7888   movl(Address(z, idx, Address::times_4,  4), tmp3);
7889   shrq(tmp3, 32);
7890   movl(Address(z, idx, Address::times_4,  0), tmp3);
7891   movq(carry, tmp4);
7892 
7893   bind (L_check_1);
7894   addl (idx, 0x2);
7895   andl (idx, 0x1);
7896   subl(idx, 1);
7897   jcc(Assembler::negative, L_post_third_loop_done);
7898   movl(tmp4, Address(y, idx, Address::times_4,  0));
7899   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
7900   movl(tmp4, Address(z, idx, Address::times_4,  0));
7901 
7902   add2_with_carry(carry2, tmp3, tmp4, carry);
7903 
7904   movl(Address(z, idx, Address::times_4,  0), tmp3);
7905   shrq(tmp3, 32);
7906 
7907   shlq(carry2, 32);
7908   orq(tmp3, carry2);
7909   movq(carry, tmp3);
7910 
7911   bind(L_post_third_loop_done);
7912 }
7913 
7914 /**
7915  * Code for BigInteger::multiplyToLen() intrinsic.
7916  *
7917  * rdi: x
7918  * rax: xlen
7919  * rsi: y
7920  * rcx: ylen
7921  * r8:  z
7922  * r11: zlen
7923  * r12: tmp1
7924  * r13: tmp2
7925  * r14: tmp3
7926  * r15: tmp4
7927  * rbx: tmp5
7928  *
7929  */
7930 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7931                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7932   ShortBranchVerifier sbv(this);
7933   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7934 
7935   push(tmp1);
7936   push(tmp2);
7937   push(tmp3);
7938   push(tmp4);
7939   push(tmp5);
7940 
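       // xlen and zlen are saved because their registers are reused below as
       // 'product' and 'x_xstart'; they are restored before returning.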
7941   push(xlen);
7942   push(zlen);
7943 
7944   const Register idx = tmp1;
7945   const Register kdx = tmp2;
7946   const Register xstart = tmp3;
7947 
7948   const Register y_idx = tmp4;
7949   const Register carry = tmp5;
7950   const Register product  = xlen;
7951   const Register x_xstart = zlen;  // reuse register
7952 
7953   // First Loop.
7954   //
7955   //  final static long LONG_MASK = 0xffffffffL;
7956   //  int xstart = xlen - 1;
7957   //  int ystart = ylen - 1;
7958   //  long carry = 0;
7959   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7960   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7961   //    z[kdx] = (int)product;
7962   //    carry = product >>> 32;
7963   //  }
7964   //  z[xstart] = (int)carry;
7965   //
7966 
7967   movl(idx, ylen);      // idx = ylen;
7968   movl(kdx, zlen);      // kdx = xlen+ylen;
7969   xorq(carry, carry);   // carry = 0;
7970 
7971   Label L_done;
7972 
7973   movl(xstart, xlen);
7974   decrementl(xstart);
7975   jcc(Assembler::negative, L_done);
7976 
7977   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7978 
7979   Label L_second_loop;
7980   testl(kdx, kdx);
7981   jcc(Assembler::zero, L_second_loop);
7982 
7983   Label L_carry;
7984   subl(kdx, 1);
7985   jcc(Assembler::zero, L_carry);
7986 
7987   movl(Address(z, kdx, Address::times_4,  0), carry);
7988   shrq(carry, 32);
7989   subl(kdx, 1);
7990 
7991   bind(L_carry);
7992   movl(Address(z, kdx, Address::times_4,  0), carry);
7993 
7994   // Second and third (nested) loops.
7995   //
7996   // for (int i = xstart-1; i >= 0; i--) { // Second loop
7997   //   carry = 0;
7998   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7999   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
8000   //                    (z[k] & LONG_MASK) + carry;
8001   //     z[k] = (int)product;
8002   //     carry = product >>> 32;
8003   //   }
8004   //   z[i] = (int)carry;
8005   // }
8006   //
8007   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
8008 
8009   const Register jdx = tmp1;
8010 
8011   bind(L_second_loop);
8012   xorl(carry, carry);    // carry = 0;
8013   movl(jdx, ylen);       // j = ystart+1
8014 
8015   subl(xstart, 1);       // i = xstart-1;
8016   jcc(Assembler::negative, L_done);
8017 
8018   push (z);
8019 
8020   Label L_last_x;
8021   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
8022   subl(xstart, 1);       // i = xstart-1;
8023   jcc(Assembler::negative, L_last_x);
8024 
8025   if (UseBMI2Instructions) {
8026     movq(rdx,  Address(x, xstart, Address::times_4,  0));
8027     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
8028   } else {
8029     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8030     rorq(x_xstart, 32);  // convert big-endian to little-endian
8031   }
8032 
8033   Label L_third_loop_prologue;
8034   bind(L_third_loop_prologue);
8035 
8036   push (x);
8037   push (xstart);
8038   push (ylen);
8039 
8040 
8041   if (UseBMI2Instructions) {
8042     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
8043   } else { // !UseBMI2Instructions
8044     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
8045   }
8046 
8047   pop(ylen);
8048   pop(xlen);
8049   pop(x);
8050   pop(z);
8051 
8052   movl(tmp3, xlen);
8053   addl(tmp3, 1);
8054   movl(Address(z, tmp3, Address::times_4,  0), carry);
8055   subl(tmp3, 1);
8056   jccb(Assembler::negative, L_done);
8057 
8058   shrq(carry, 32);
8059   movl(Address(z, tmp3, Address::times_4,  0), carry);
8060   jmp(L_second_loop);
8061 
8062   // Next infrequent code is moved outside loops.
8063   bind(L_last_x);
8064   if (UseBMI2Instructions) {
8065     movl(rdx, Address(x,  0));
8066   } else {
8067     movl(x_xstart, Address(x,  0));
8068   }
8069   jmp(L_third_loop_prologue);
8070 
8071   bind(L_done);
8072 
8073   pop(zlen);
8074   pop(xlen);
8075 
8076   pop(tmp5);
8077   pop(tmp4);
8078   pop(tmp3);
8079   pop(tmp2);
8080   pop(tmp1);
8081 }
8082 
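     // Find the first element at which the arrays 'obja' and 'objb' differ.
     // 'length' is the element count and 'log2_array_indxscale' (in rcx) is the
     // log2 of the element size. On return 'result' holds the element index of
     // the first mismatch, or -1 if no mismatch was found.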
8083 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
8084   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
8085   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
8086   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
8087   Label VECTOR8_TAIL, VECTOR4_TAIL;
8088   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
8089   Label SAME_TILL_END, DONE;
8090   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
8091 
8092   //scale is in rcx in both Win64 and Unix
8093   ShortBranchVerifier sbv(this);
8094 
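       // Convert the element count to a byte count (shift by the scale in rcx)
       // and clear 'result', which tracks the current byte offset.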
8095   shlq(length);
8096   xorq(result, result);
8097 
8098   if ((UseAVX > 2) &&
8099       VM_Version::supports_avx512vlbw()) {
8100     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
8101 
8102     cmpq(length, 64);
8103     jcc(Assembler::less, VECTOR32_TAIL);
8104     movq(tmp1, length);
8105     andq(tmp1, 0x3F);      // tail count
8106     andq(length, ~(0x3F)); //vector count
8107 
8108     bind(VECTOR64_LOOP);
8109     // AVX512 code to compare 64 byte vectors.
8110     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
8111     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
8112     kortestql(k7, k7);
8113     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
8114     addq(result, 64);
8115     subq(length, 64);
8116     jccb(Assembler::notZero, VECTOR64_LOOP);
8117 
8119     testq(tmp1, tmp1);
8120     jcc(Assembler::zero, SAME_TILL_END);
8121 
8122     //bind(VECTOR64_TAIL);
8123     // AVX512 code to compare up to 63 remaining bytes under a mask.
8124     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
8125     shlxq(tmp2, tmp2, tmp1);
8126     notq(tmp2);
8127     kmovql(k3, tmp2);
8128 
8129     evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
8130     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
8131 
8132     ktestql(k7, k3);
8133     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
8134 
8135     bind(VECTOR64_NOT_EQUAL);
8136     kmovql(tmp1, k7);
8137     notq(tmp1);
8138     tzcntq(tmp1, tmp1);
8139     addq(result, tmp1);
8140     shrq(result);
8141     jmp(DONE);
8142     bind(VECTOR32_TAIL);
8143   }
8144 
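       // length == 8 goes straight to the 8-byte compare and length < 8 to the
       // tail code; larger lengths fall into the 32/16-byte compare loops below.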
8145   cmpq(length, 8);
8146   jcc(Assembler::equal, VECTOR8_LOOP);
8147   jcc(Assembler::less, VECTOR4_TAIL);
8148 
8149   if (UseAVX >= 2) {
8150     Label VECTOR16_TAIL, VECTOR32_LOOP;
8151 
8152     cmpq(length, 16);
8153     jcc(Assembler::equal, VECTOR16_LOOP);
8154     jcc(Assembler::less, VECTOR8_LOOP);
8155 
8156     cmpq(length, 32);
8157     jccb(Assembler::less, VECTOR16_TAIL);
8158 
8159     subq(length, 32);
8160     bind(VECTOR32_LOOP);
8161     vmovdqu(rymm0, Address(obja, result));
8162     vmovdqu(rymm1, Address(objb, result));
8163     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8164     vptest(rymm2, rymm2);
8165     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8166     addq(result, 32);
8167     subq(length, 32);
8168     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
8169     addq(length, 32);
8170     jcc(Assembler::equal, SAME_TILL_END);
8171     // falling through if less than 32 bytes left; close the branch here.
8172 
8173     bind(VECTOR16_TAIL);
8174     cmpq(length, 16);
8175     jccb(Assembler::less, VECTOR8_TAIL);
8176     bind(VECTOR16_LOOP);
8177     movdqu(rymm0, Address(obja, result));
8178     movdqu(rymm1, Address(objb, result));
8179     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8180     ptest(rymm2, rymm2);
8181     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8182     addq(result, 16);
8183     subq(length, 16);
8184     jcc(Assembler::equal, SAME_TILL_END);
8185     //falling through if less than 16 bytes left
8186   } else {//regular intrinsics
8187 
8188     cmpq(length, 16);
8189     jccb(Assembler::less, VECTOR8_TAIL);
8190 
8191     subq(length, 16);
8192     bind(VECTOR16_LOOP);
8193     movdqu(rymm0, Address(obja, result));
8194     movdqu(rymm1, Address(objb, result));
8195     pxor(rymm0, rymm1);
8196     ptest(rymm0, rymm0);
8197     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8198     addq(result, 16);
8199     subq(length, 16);
8200     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8201     addq(length, 16);
8202     jcc(Assembler::equal, SAME_TILL_END);
8203     //falling through if less than 16 bytes left
8204   }
8205 
8206   bind(VECTOR8_TAIL);
8207   cmpq(length, 8);
8208   jccb(Assembler::less, VECTOR4_TAIL);
8209   bind(VECTOR8_LOOP);
8210   movq(tmp1, Address(obja, result));
8211   movq(tmp2, Address(objb, result));
8212   xorq(tmp1, tmp2);
8213   testq(tmp1, tmp1);
8214   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8215   addq(result, 8);
8216   subq(length, 8);
8217   jcc(Assembler::equal, SAME_TILL_END);
8218   //falling through if less than 8 bytes left
8219 
8220   bind(VECTOR4_TAIL);
8221   cmpq(length, 4);
8222   jccb(Assembler::less, BYTES_TAIL);
8223   bind(VECTOR4_LOOP);
8224   movl(tmp1, Address(obja, result));
8225   xorl(tmp1, Address(objb, result));
8226   testl(tmp1, tmp1);
8227   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8228   addq(result, 4);
8229   subq(length, 4);
8230   jcc(Assembler::equal, SAME_TILL_END);
8231   //falling through if less than 4 bytes left
8232 
8233   bind(BYTES_TAIL);
8234   bind(BYTES_LOOP);
8235   load_unsigned_byte(tmp1, Address(obja, result));
8236   load_unsigned_byte(tmp2, Address(objb, result));
8237   xorl(tmp1, tmp2);
8238   testl(tmp1, tmp1);
8239   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8240   decq(length);
8241   jcc(Assembler::zero, SAME_TILL_END);
8242   incq(result);
8243   load_unsigned_byte(tmp1, Address(obja, result));
8244   load_unsigned_byte(tmp2, Address(objb, result));
8245   xorl(tmp1, tmp2);
8246   testl(tmp1, tmp1);
8247   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8248   decq(length);
8249   jcc(Assembler::zero, SAME_TILL_END);
8250   incq(result);
8251   load_unsigned_byte(tmp1, Address(obja, result));
8252   load_unsigned_byte(tmp2, Address(objb, result));
8253   xorl(tmp1, tmp2);
8254   testl(tmp1, tmp1);
8255   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8256   jmp(SAME_TILL_END);
8257 
8258   if (UseAVX >= 2) {
8259     bind(VECTOR32_NOT_EQUAL);
8260     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
8261     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
8262     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
8263     vpmovmskb(tmp1, rymm0);
8264     bsfq(tmp1, tmp1);
8265     addq(result, tmp1);
8266     shrq(result);
8267     jmp(DONE);
8268   }
8269 
8270   bind(VECTOR16_NOT_EQUAL);
8271   if (UseAVX >= 2) {
8272     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
8273     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
8274     pxor(rymm0, rymm2);
8275   } else {
8276     pcmpeqb(rymm2, rymm2);
8277     pxor(rymm0, rymm1);
8278     pcmpeqb(rymm0, rymm1);
8279     pxor(rymm0, rymm2);
8280   }
8281   pmovmskb(tmp1, rymm0);
8282   bsfq(tmp1, tmp1);
8283   addq(result, tmp1);
8284   shrq(result);
8285   jmpb(DONE);
8286 
8287   bind(VECTOR8_NOT_EQUAL);
8288   bind(VECTOR4_NOT_EQUAL);
8289   bsfq(tmp1, tmp1);
8290   shrq(tmp1, 3);
8291   addq(result, tmp1);
8292   bind(BYTES_NOT_EQUAL);
8293   shrq(result);
8294   jmpb(DONE);
8295 
8296   bind(SAME_TILL_END);
8297   mov64(result, -1);
8298 
8299   bind(DONE);
8300 }
8301 
8302 //Helper functions for square_to_len()
8303 
8304 /**
8305  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
8306  * Preserves x and z and modifies rest of the registers.
8307  */
8308 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8309   // Perform square and right shift by 1
8310   // Handle odd xlen case first, then for even xlen do the following
8311   // jlong carry = 0;
8312   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
8313   //     huge_128 product = x[j:j+1] * x[j:j+1];
8314   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
8315   //     z[i+2:i+3] = (jlong)(product >>> 1);
8316   //     carry = (jlong)product;
8317   // }
8318 
8319   xorq(tmp5, tmp5);     // carry
8320   xorq(rdxReg, rdxReg);
8321   xorl(tmp1, tmp1);     // index for x
8322   xorl(tmp4, tmp4);     // index for z
8323 
8324   Label L_first_loop, L_first_loop_exit;
8325 
8326   testl(xlen, 1);
8327   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
8328 
8329   // Square and right shift by 1 the odd element using 32 bit multiply
8330   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
8331   imulq(raxReg, raxReg);
8332   shrq(raxReg, 1);
8333   adcq(tmp5, 0);
8334   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
8335   incrementl(tmp1);
8336   addl(tmp4, 2);
8337 
8338   // Square and  right shift by 1 the rest using 64 bit multiply
8339   bind(L_first_loop);
8340   cmpptr(tmp1, xlen);
8341   jccb(Assembler::equal, L_first_loop_exit);
8342 
8343   // Square
8344   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
8345   rorq(raxReg, 32);    // convert big-endian to little-endian
8346   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
8347 
8348   // Right shift by 1 and save carry
8349   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
8350   rcrq(rdxReg, 1);
8351   rcrq(raxReg, 1);
8352   adcq(tmp5, 0);
8353 
8354   // Store result in z
8355   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
8356   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
8357 
8358   // Update indices for x and z
8359   addl(tmp1, 2);
8360   addl(tmp4, 4);
8361   jmp(L_first_loop);
8362 
8363   bind(L_first_loop_exit);
8364 }
8365 
8366 
8367 /**
8368  * Perform the following multiply add operation using BMI2 instructions
8369  * carry:sum = sum + op1*op2 + carry
8370  * op2 should be in rdx
8371  * op2 is preserved, all other registers are modified
8372  */
8373 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
8374   // assert op2 is rdx
8375   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
8376   addq(sum, carry);
8377   adcq(tmp2, 0);
8378   addq(sum, op1);
8379   adcq(tmp2, 0);
8380   movq(carry, tmp2);
8381 }
8382 
8383 /**
8384  * Perform the following multiply add operation:
8385  * carry:sum = sum + op1*op2 + carry
8386  * Preserves op1, op2 and modifies rest of registers
8387  */
8388 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
8389   // rdx:rax = op1 * op2
8390   movq(raxReg, op2);
8391   mulq(op1);
8392 
8393   //  rdx:rax = sum + carry + rdx:rax
8394   addq(sum, carry);
8395   adcq(rdxReg, 0);
8396   addq(sum, raxReg);
8397   adcq(rdxReg, 0);
8398 
8399   // carry:sum = rdx:sum
8400   movq(carry, rdxReg);
8401 }
8402 
8403 /**
8404  * Add 64 bit long carry into z[] with carry propagation.
8405  * Preserves the z and carry register values and modifies the rest of the registers.
8406  *
8407  */
8408 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
8409   Label L_fourth_loop, L_fourth_loop_exit;
8410 
8411   movl(tmp1, 1);
8412   subl(zlen, 2);
8413   addq(Address(z, zlen, Address::times_4, 0), carry);
8414 
8415   bind(L_fourth_loop);
8416   jccb(Assembler::carryClear, L_fourth_loop_exit);
8417   subl(zlen, 2);
8418   jccb(Assembler::negative, L_fourth_loop_exit);
8419   addq(Address(z, zlen, Address::times_4, 0), tmp1);
8420   jmp(L_fourth_loop);
8421   bind(L_fourth_loop_exit);
8422 }
8423 
8424 /**
8425  * Shift z[] left by 1 bit.
8426  * Preserves x, len, z and zlen registers and modifies rest of the registers.
8427  *
8428  */
8429 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
8430 
8431   Label L_fifth_loop, L_fifth_loop_exit;
8432 
8433   // Fifth loop
8434   // Perform primitiveLeftShift(z, zlen, 1)
8435 
8436   const Register prev_carry = tmp1;
8437   const Register new_carry = tmp4;
8438   const Register value = tmp2;
8439   const Register zidx = tmp3;
8440 
8441   // int zidx, carry;
8442   // long value;
8443   // carry = 0;
8444   // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
8445   //    (carry:value) = (z[zidx] << 1) | carry;
8446   //    z[zidx] = value;
8447   // }
8448 
8449   movl(zidx, zlen);
8450   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
8451 
8452   bind(L_fifth_loop);
8453   decl(zidx);  // Use decl to preserve carry flag
8454   decl(zidx);
8455   jccb(Assembler::negative, L_fifth_loop_exit);
8456 
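       // In the BMI2 path below, decl/rorxq leave CF untouched, so rclq rotates
       // in the bit shifted out by the previous iteration's rclq.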
8457   if (UseBMI2Instructions) {
8458      movq(value, Address(z, zidx, Address::times_4, 0));
8459      rclq(value, 1);
8460      rorxq(value, value, 32);
8461      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8462   }
8463   else {
8464     // clear new_carry
8465     xorl(new_carry, new_carry);
8466 
8467     // Shift z[i] by 1, or in previous carry and save new carry
8468     movq(value, Address(z, zidx, Address::times_4, 0));
8469     shlq(value, 1);
8470     adcl(new_carry, 0);
8471 
8472     orq(value, prev_carry);
8473     rorq(value, 0x20);
8474     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8475 
8476     // Set previous carry = new carry
8477     movl(prev_carry, new_carry);
8478   }
8479   jmp(L_fifth_loop);
8480 
8481   bind(L_fifth_loop_exit);
8482 }
8483 
8484 
8485 /**
8486  * Code for BigInteger::squareToLen() intrinsic
8487  *
8488  * rdi: x
8489  * rsi: len
8490  * r8:  z
8491  * rcx: zlen
8492  * r12: tmp1
8493  * r13: tmp2
8494  * r14: tmp3
8495  * r15: tmp4
8496  * rbx: tmp5
8497  *
8498  */
8499 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8500 
8501   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
8502   push(tmp1);
8503   push(tmp2);
8504   push(tmp3);
8505   push(tmp4);
8506   push(tmp5);
8507 
8508   // First loop
8509   // Store the squares, right shifted one bit (i.e., divided by 2).
8510   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
8511 
8512   // Add in off-diagonal sums.
8513   //
8514   // Second, third (nested) and fourth loops.
8515   // zlen +=2;
8516   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
8517   //    carry = 0;
8518   //    long op2 = x[xidx:xidx+1];
8519   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
8520   //       k -= 2;
8521   //       long op1 = x[j:j+1];
8522   //       long sum = z[k:k+1];
8523   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
8524   //       z[k:k+1] = sum;
8525   //    }
8526   //    add_one_64(z, k, carry, tmp_regs);
8527   // }
8528 
8529   const Register carry = tmp5;
8530   const Register sum = tmp3;
8531   const Register op1 = tmp4;
8532   Register op2 = tmp2;
8533 
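       // zlen and len are saved (and re-saved each pass) because the second and
       // third loops decrement them in place.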
8534   push(zlen);
8535   push(len);
8536   addl(zlen,2);
8537   bind(L_second_loop);
8538   xorq(carry, carry);
8539   subl(zlen, 4);
8540   subl(len, 2);
8541   push(zlen);
8542   push(len);
8543   cmpl(len, 0);
8544   jccb(Assembler::lessEqual, L_second_loop_exit);
8545 
8546   // Multiply an array by one 64 bit long.
8547   if (UseBMI2Instructions) {
8548     op2 = rdxReg;
8549     movq(op2, Address(x, len, Address::times_4,  0));
8550     rorxq(op2, op2, 32);
8551   }
8552   else {
8553     movq(op2, Address(x, len, Address::times_4,  0));
8554     rorq(op2, 32);
8555   }
8556 
8557   bind(L_third_loop);
8558   decrementl(len);
8559   jccb(Assembler::negative, L_third_loop_exit);
8560   decrementl(len);
8561   jccb(Assembler::negative, L_last_x);
8562 
8563   movq(op1, Address(x, len, Address::times_4,  0));
8564   rorq(op1, 32);
8565 
8566   bind(L_multiply);
8567   subl(zlen, 2);
8568   movq(sum, Address(z, zlen, Address::times_4,  0));
8569 
8570   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
8571   if (UseBMI2Instructions) {
8572     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8573   }
8574   else {
8575     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8576   }
8577 
8578   movq(Address(z, zlen, Address::times_4, 0), sum);
8579 
8580   jmp(L_third_loop);
8581   bind(L_third_loop_exit);
8582 
8583   // Fourth loop
8584   // Add 64 bit long carry into z with carry propagation.
8585   // Uses the adjusted zlen.
8586   add_one_64(z, zlen, carry, tmp1);
8587 
8588   pop(len);
8589   pop(zlen);
8590   jmp(L_second_loop);
8591 
8592   // Next infrequent code is moved outside loops.
8593   bind(L_last_x);
8594   movl(op1, Address(x, 0));
8595   jmp(L_multiply);
8596 
8597   bind(L_second_loop_exit);
8598   pop(len);
8599   pop(zlen);
8600   pop(len);
8601   pop(zlen);
8602 
8603   // Fifth loop
8604   // Shift z left 1 bit.
8605   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8606 
8607   // z[zlen-1] |= x[len-1] & 1;
8608   movl(tmp3, Address(x, len, Address::times_4, -4));
8609   andl(tmp3, 1);
8610   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
8611 
8612   pop(tmp5);
8613   pop(tmp4);
8614   pop(tmp3);
8615   pop(tmp2);
8616   pop(tmp1);
8617 }
8618 
8619 /**
8620  * Helper function for mul_add()
8621  * Multiply the in[] by int k and add to out[] starting at offset offs using
8622  * 128 bit by 32 bit multiply and return the carry in tmp5.
8623  * Only the quad-int-aligned portion of in[] is processed by this function.
8624  * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
8625  * This function preserves the out, in and k registers.
8626  * len and offset point to the appropriate indices in "in" and "out" respectively.
8627  * tmp5 holds the carry.
8628  * The other registers are temporary and are modified.
8629  *
8630  */
8631 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8632   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8633   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8634 
8635   Label L_first_loop, L_first_loop_exit;
8636 
8637   movl(tmp1, len);
8638   shrl(tmp1, 2);
8639 
8640   bind(L_first_loop);
8641   subl(tmp1, 1);
8642   jccb(Assembler::negative, L_first_loop_exit);
8643 
8644   subl(len, 4);
8645   subl(offset, 4);
8646 
8647   Register op2 = tmp2;
8648   const Register sum = tmp3;
8649   const Register op1 = tmp4;
8650   const Register carry = tmp5;
8651 
8652   if (UseBMI2Instructions) {
8653     op2 = rdxReg;
8654   }
8655 
8656   movq(op1, Address(in, len, Address::times_4,  8));
8657   rorq(op1, 32);
8658   movq(sum, Address(out, offset, Address::times_4,  8));
8659   rorq(sum, 32);
8660   if (UseBMI2Instructions) {
8661     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8662   }
8663   else {
8664     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8665   }
8666   // Store back in big endian from little endian
8667   rorq(sum, 0x20);
8668   movq(Address(out, offset, Address::times_4,  8), sum);
8669 
8670   movq(op1, Address(in, len, Address::times_4,  0));
8671   rorq(op1, 32);
8672   movq(sum, Address(out, offset, Address::times_4,  0));
8673   rorq(sum, 32);
8674   if (UseBMI2Instructions) {
8675     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8676   }
8677   else {
8678     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8679   }
8680   // Store back in big endian from little endian
8681   rorq(sum, 0x20);
8682   movq(Address(out, offset, Address::times_4,  0), sum);
8683 
8684   jmp(L_first_loop);
8685   bind(L_first_loop_exit);
8686 }
8687 
8688 /**
8689  * Code for BigInteger::mulAdd() intrinsic
8690  *
8691  * rdi: out
8692  * rsi: in
8693  * r11: offs (out.length - offset)
8694  * rcx: len
8695  * r8:  k
8696  * r12: tmp1
8697  * r13: tmp2
8698  * r14: tmp3
8699  * r15: tmp4
8700  * rbx: tmp5
8701  * Multiply the in[] by word k and add to out[], return the carry in rax
8702  */
8703 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8704    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8705    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8706 
8707   Label L_carry, L_last_in, L_done;
8708 
8709 // carry = 0;
8710 // for (int j=len-1; j >= 0; j--) {
8711 //    long product = (in[j] & LONG_MASK) * kLong +
8712 //                   (out[offs] & LONG_MASK) + carry;
8713 //    out[offs--] = (int)product;
8714 //    carry = product >>> 32;
8715 // }
8716 //
8717   push(tmp1);
8718   push(tmp2);
8719   push(tmp3);
8720   push(tmp4);
8721   push(tmp5);
8722 
8723   Register op2 = tmp2;
8724   const Register sum = tmp3;
8725   const Register op1 = tmp4;
8726   const Register carry =  tmp5;
8727 
8728   if (UseBMI2Instructions) {
8729     op2 = rdxReg;   // mulxq reads its second operand implicitly from rdx
8730   }
8731   movl(op2, k);
8735 
8736   xorq(carry, carry);
8737 
8738   // First loop
8739 
8740   // Multiply in[] by k in a 4-way unrolled loop using 128-bit by 32-bit multiplies.
8741   // The carry is in tmp5.
8742   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8743 
8744   // Multiply the trailing in[] entry using 64-bit by 32-bit multiply, if any.
8745   decrementl(len);
8746   jccb(Assembler::negative, L_carry);
8747   decrementl(len);
8748   jccb(Assembler::negative, L_last_in);
8749 
8750   movq(op1, Address(in, len, Address::times_4,  0));
8751   rorq(op1, 32);
8752 
8753   subl(offs, 2);
8754   movq(sum, Address(out, offs, Address::times_4,  0));
8755   rorq(sum, 32);
8756 
8757   if (UseBMI2Instructions) {
8758     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8759   }
8760   else {
8761     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8762   }
8763 
8764   // Store back in big endian from little endian
8765   rorq(sum, 0x20);
8766   movq(Address(out, offs, Address::times_4,  0), sum);
8767 
8768   testl(len, len);
8769   jccb(Assembler::zero, L_carry);
8770 
8771   //Multiply the last in[] entry, if any
8772   bind(L_last_in);
8773   movl(op1, Address(in, 0));
8774   movl(sum, Address(out, offs, Address::times_4,  -4));
8775 
8776   movl(raxReg, k);
8777   mull(op1); //tmp4 * eax -> edx:eax
8778   addl(sum, carry);
8779   adcl(rdxReg, 0);
8780   addl(sum, raxReg);
8781   adcl(rdxReg, 0);
8782   movl(carry, rdxReg);
8783 
8784   movl(Address(out, offs, Address::times_4,  -4), sum);
8785 
8786   bind(L_carry);
8787   //return tmp5/carry as carry in rax
8788   movl(rax, carry);
8789 
8790   bind(L_done);
8791   pop(tmp5);
8792   pop(tmp4);
8793   pop(tmp3);
8794   pop(tmp2);
8795   pop(tmp1);
8796 }
8797 #endif
8798 
8799 /**
8800  * Emits code to update CRC-32 with a byte value according to constants in table
8801  *
8802  * @param [in,out]crc   Register containing the crc.
8803  * @param [in]val       Register containing the byte to fold into the CRC.
8804  * @param [in]table     Register containing the table of crc constants.
8805  *
8806  * uint32_t crc;
8807  * val = crc_table[(val ^ crc) & 0xFF];
8808  * crc = val ^ (crc >> 8);
8809  *
8810  */
8811 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8812   xorl(val, crc);
8813   andl(val, 0xFF);
8814   shrl(crc, 8); // unsigned shift
8815   xorl(crc, Address(table, val, Address::times_4, 0));
8816 }
8817 
8818 /**
8819 * Fold four 128-bit data chunks
8820 */
8821 void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8822   evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64]
8823   evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
8824   evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
8825   evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
8826 }
8827 
8828 /**
8829  * Fold 128-bit data chunk
8830  */
8831 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8832   if (UseAVX > 0) {
8833     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
8834     vpclmulldq(xcrc, xK, xcrc); // [63:0]
8835     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8836     pxor(xcrc, xtmp);
8837   } else {
8838     movdqa(xtmp, xcrc);
8839     pclmulhdq(xtmp, xK);   // [123:64]
8840     pclmulldq(xcrc, xK);   // [63:0]
8841     pxor(xcrc, xtmp);
8842     movdqu(xtmp, Address(buf, offset));
8843     pxor(xcrc, xtmp);
8844   }
8845 }
8846 
8847 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
8848   if (UseAVX > 0) {
8849     vpclmulhdq(xtmp, xK, xcrc);
8850     vpclmulldq(xcrc, xK, xcrc);
8851     pxor(xcrc, xbuf);
8852     pxor(xcrc, xtmp);
8853   } else {
8854     movdqa(xtmp, xcrc);
8855     pclmulhdq(xtmp, xK);
8856     pclmulldq(xcrc, xK);
8857     pxor(xcrc, xbuf);
8858     pxor(xcrc, xtmp);
8859   }
8860 }
8861 
8862 /**
8863  * 8-bit folds to compute 32-bit CRC
8864  *
8865  * uint64_t xcrc;
8866  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8867  */
8868 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8869   movdl(tmp, xcrc);
8870   andl(tmp, 0xFF);
8871   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8872   psrldq(xcrc, 1); // unsigned shift one byte
8873   pxor(xcrc, xtmp);
8874 }
8875 
8876 /**
8877  * uint32_t crc;
8878  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8879  */
8880 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8881   movl(tmp, crc);
8882   andl(tmp, 0xFF);
8883   shrl(crc, 8);
8884   xorl(crc, Address(table, tmp, Address::times_4, 0));
8885 }
8886 
8887 /**
8888  * @param crc   register containing existing CRC (32-bit)
8889  * @param buf   register pointing to input byte buffer (byte*)
8890  * @param len   register containing number of bytes
8891  * @param table register that will contain address of CRC table
8892  * @param tmp   scratch register
8893  */
8894 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8895   assert_different_registers(crc, buf, len, table, tmp, rax);
8896 
8897   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8898   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8899 
8900   // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
8901   // context for the registers used, since all instructions below operate in 128-bit mode.
8902   // On EVEX without VL and BW, these instructions will all be AVX.
8903   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8904   notl(crc); // ~crc
8905   cmpl(len, 16);
8906   jcc(Assembler::less, L_tail);
8907 
8908   // Align buffer to 16 bytes
8909   movl(tmp, buf);
8910   andl(tmp, 0xF);
8911   jccb(Assembler::zero, L_aligned);
8912   subl(tmp,  16);
8913   addl(len, tmp);
8914 
8915   align(4);
8916   BIND(L_align_loop);
8917   movsbl(rax, Address(buf, 0)); // load byte with sign extension
8918   update_byte_crc32(crc, rax, table);
8919   increment(buf);
8920   incrementl(tmp);
8921   jccb(Assembler::less, L_align_loop);
8922 
8923   BIND(L_aligned);
8924   movl(tmp, len); // save
8925   shrl(len, 4);
8926   jcc(Assembler::zero, L_tail_restore);
8927 
8928   // Fold total 512 bits of polynomial on each iteration
8929   if (VM_Version::supports_vpclmulqdq()) {
8930     Label Parallel_loop, L_No_Parallel;
8931 
8932     cmpl(len, 8);
8933     jccb(Assembler::less, L_No_Parallel);
8934 
8935     movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8936     evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
8937     movdl(xmm5, crc);
8938     evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
8939     addptr(buf, 64);
8940     subl(len, 7);
8941     evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits
8942 
8943     BIND(Parallel_loop);
8944     fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
8945     addptr(buf, 64);
8946     subl(len, 4);
8947     jcc(Assembler::greater, Parallel_loop);
8948 
8949     vextracti64x2(xmm2, xmm1, 0x01);
8950     vextracti64x2(xmm3, xmm1, 0x02);
8951     vextracti64x2(xmm4, xmm1, 0x03);
8952     jmp(L_fold_512b);
8953 
8954     BIND(L_No_Parallel);
8955   }
8956   // Fold crc into first bytes of vector
8957   movdqa(xmm1, Address(buf, 0));
8958   movdl(rax, xmm1);
8959   xorl(crc, rax);
8960   if (VM_Version::supports_sse4_1()) {
8961     pinsrd(xmm1, crc, 0);
8962   } else {
8963     pinsrw(xmm1, crc, 0);
8964     shrl(crc, 16);
8965     pinsrw(xmm1, crc, 1);
8966   }
8967   addptr(buf, 16);
8968   subl(len, 4); // len > 0
8969   jcc(Assembler::less, L_fold_tail);
8970 
8971   movdqa(xmm2, Address(buf,  0));
8972   movdqa(xmm3, Address(buf, 16));
8973   movdqa(xmm4, Address(buf, 32));
8974   addptr(buf, 48);
8975   subl(len, 3);
8976   jcc(Assembler::lessEqual, L_fold_512b);
8977 
8978   // Fold total 512 bits of polynomial on each iteration,
8979   // 128 bits per each of 4 parallel streams.
8980   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8981 
8982   align(32);
8983   BIND(L_fold_512b_loop);
8984   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
8985   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8986   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8987   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8988   addptr(buf, 64);
8989   subl(len, 4);
8990   jcc(Assembler::greater, L_fold_512b_loop);
8991 
8992   // Fold 512 bits to 128 bits.
8993   BIND(L_fold_512b);
8994   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
8995   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
8996   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
8997   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
8998 
8999   // Fold the rest of 128 bits data chunks
9000   BIND(L_fold_tail);
9001   addl(len, 3);
9002   jccb(Assembler::lessEqual, L_fold_128b);
9003   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9004 
9005   BIND(L_fold_tail_loop);
9006   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9007   addptr(buf, 16);
9008   decrementl(len);
9009   jccb(Assembler::greater, L_fold_tail_loop);
9010 
9011   // Fold 128 bits in xmm1 down into 32 bits in crc register.
9012   BIND(L_fold_128b);
9013   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
9014   if (UseAVX > 0) {
9015     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
9016     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
9017     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
9018   } else {
9019     movdqa(xmm2, xmm0);
9020     pclmulqdq(xmm2, xmm1, 0x1);
9021     movdqa(xmm3, xmm0);
9022     pand(xmm3, xmm2);
9023     pclmulqdq(xmm0, xmm3, 0x1);
9024   }
9025   psrldq(xmm1, 8);
9026   psrldq(xmm2, 4);
9027   pxor(xmm0, xmm1);
9028   pxor(xmm0, xmm2);
9029 
9030   // 8 8-bit folds to compute 32-bit CRC.
9031   for (int j = 0; j < 4; j++) {
9032     fold_8bit_crc32(xmm0, table, xmm1, rax);
9033   }
9034   movdl(crc, xmm0); // mov 32 bits to general register
9035   for (int j = 0; j < 4; j++) {
9036     fold_8bit_crc32(crc, table, rax);
9037   }
9038 
9039   BIND(L_tail_restore);
9040   movl(len, tmp); // restore
9041   BIND(L_tail);
9042   andl(len, 0xf);
9043   jccb(Assembler::zero, L_exit);
9044 
9045   // Fold the rest of bytes
9046   align(4);
9047   BIND(L_tail_loop);
9048   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9049   update_byte_crc32(crc, rax, table);
9050   increment(buf);
9051   decrementl(len);
9052   jccb(Assembler::greater, L_tail_loop);
9053 
9054   BIND(L_exit);
  notl(crc); // ~crc
9056 }
9057 
9058 #ifdef _LP64
9059 // S. Gueron / Information Processing Letters 112 (2012) 184
9060 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32-bit value B = [byte3, byte2, byte1, byte0].
9062 // Output: the 64-bit carry-less product of B * CONST
9063 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
9064                                      Register tmp1, Register tmp2, Register tmp3) {
9065   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9066   if (n > 0) {
9067     addq(tmp3, n * 256 * 8);
9068   }
9069   //    Q1 = TABLEExt[n][B & 0xFF];
9070   movl(tmp1, in);
9071   andl(tmp1, 0x000000FF);
9072   shll(tmp1, 3);
9073   addq(tmp1, tmp3);
9074   movq(tmp1, Address(tmp1, 0));
9075 
9076   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9077   movl(tmp2, in);
9078   shrl(tmp2, 8);
9079   andl(tmp2, 0x000000FF);
9080   shll(tmp2, 3);
9081   addq(tmp2, tmp3);
9082   movq(tmp2, Address(tmp2, 0));
9083 
9084   shlq(tmp2, 8);
9085   xorq(tmp1, tmp2);
9086 
9087   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9088   movl(tmp2, in);
9089   shrl(tmp2, 16);
9090   andl(tmp2, 0x000000FF);
9091   shll(tmp2, 3);
9092   addq(tmp2, tmp3);
9093   movq(tmp2, Address(tmp2, 0));
9094 
9095   shlq(tmp2, 16);
9096   xorq(tmp1, tmp2);
9097 
9098   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9099   shrl(in, 24);
9100   andl(in, 0x000000FF);
9101   shll(in, 3);
9102   addq(in, tmp3);
9103   movq(in, Address(in, 0));
9104 
9105   shlq(in, 24);
9106   xorq(in, tmp1);
9107   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9108 }
9109 
9110 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9111                                       Register in_out,
9112                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9113                                       XMMRegister w_xtmp2,
9114                                       Register tmp1,
9115                                       Register n_tmp2, Register n_tmp3) {
9116   if (is_pclmulqdq_supported) {
9117     movdl(w_xtmp1, in_out); // modified blindly
9118 
9119     movl(tmp1, const_or_pre_comp_const_index);
9120     movdl(w_xtmp2, tmp1);
9121     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9122 
9123     movdq(in_out, w_xtmp1);
9124   } else {
9125     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
9126   }
9127 }
9128 
9129 // Recombination Alternative 2: No bit-reflections
9130 // T1 = (CRC_A * U1) << 1
9131 // T2 = (CRC_B * U2) << 1
9132 // C1 = T1 >> 32
9133 // C2 = T2 >> 32
9134 // T1 = T1 & 0xFFFFFFFF
9135 // T2 = T2 & 0xFFFFFFFF
9136 // T1 = CRC32(0, T1)
9137 // T2 = CRC32(0, T2)
9138 // C1 = C1 ^ T1
9139 // C2 = C2 ^ T2
9140 // CRC = C1 ^ C2 ^ CRC_C
9141 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9142                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9143                                      Register tmp1, Register tmp2,
9144                                      Register n_tmp3) {
9145   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9146   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9147   shlq(in_out, 1);
9148   movl(tmp1, in_out);
9149   shrq(in_out, 32);
9150   xorl(tmp2, tmp2);
9151   crc32(tmp2, tmp1, 4);
9152   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
9153   shlq(in1, 1);
9154   movl(tmp1, in1);
9155   shrq(in1, 32);
9156   xorl(tmp2, tmp2);
9157   crc32(tmp2, tmp1, 4);
9158   xorl(in1, tmp2);
9159   xorl(in_out, in1);
9160   xorl(in_out, in2);
9161 }
9162 
// Set N to a predefined value
// Subtract N from the length of the buffer
// Execute in a loop:
9166 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9167 // for i = 1 to N do
9168 //  CRC_A = CRC32(CRC_A, A[i])
9169 //  CRC_B = CRC32(CRC_B, B[i])
9170 //  CRC_C = CRC32(CRC_C, C[i])
9171 // end for
9172 // Recombine
9173 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9174                                        Register in_out1, Register in_out2, Register in_out3,
9175                                        Register tmp1, Register tmp2, Register tmp3,
9176                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9177                                        Register tmp4, Register tmp5,
9178                                        Register n_tmp6) {
9179   Label L_processPartitions;
9180   Label L_processPartition;
9181   Label L_exit;
9182 
9183   bind(L_processPartitions);
9184   cmpl(in_out1, 3 * size);
9185   jcc(Assembler::less, L_exit);
9186     xorl(tmp1, tmp1);
9187     xorl(tmp2, tmp2);
9188     movq(tmp3, in_out2);
9189     addq(tmp3, size);
9190 
9191     bind(L_processPartition);
9192       crc32(in_out3, Address(in_out2, 0), 8);
9193       crc32(tmp1, Address(in_out2, size), 8);
9194       crc32(tmp2, Address(in_out2, size * 2), 8);
9195       addq(in_out2, 8);
9196       cmpq(in_out2, tmp3);
9197       jcc(Assembler::less, L_processPartition);
9198     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9199             w_xtmp1, w_xtmp2, w_xtmp3,
9200             tmp4, tmp5,
9201             n_tmp6);
9202     addq(in_out2, 2 * size);
9203     subl(in_out1, 3 * size);
9204     jmp(L_processPartitions);
9205 
9206   bind(L_exit);
9207 }
9208 #else
9209 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9210                                      Register tmp1, Register tmp2, Register tmp3,
9211                                      XMMRegister xtmp1, XMMRegister xtmp2) {
9212   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9213   if (n > 0) {
9214     addl(tmp3, n * 256 * 8);
9215   }
9216   //    Q1 = TABLEExt[n][B & 0xFF];
9217   movl(tmp1, in_out);
9218   andl(tmp1, 0x000000FF);
9219   shll(tmp1, 3);
9220   addl(tmp1, tmp3);
9221   movq(xtmp1, Address(tmp1, 0));
9222 
9223   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9224   movl(tmp2, in_out);
9225   shrl(tmp2, 8);
9226   andl(tmp2, 0x000000FF);
9227   shll(tmp2, 3);
9228   addl(tmp2, tmp3);
9229   movq(xtmp2, Address(tmp2, 0));
9230 
9231   psllq(xtmp2, 8);
9232   pxor(xtmp1, xtmp2);
9233 
9234   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9235   movl(tmp2, in_out);
9236   shrl(tmp2, 16);
9237   andl(tmp2, 0x000000FF);
9238   shll(tmp2, 3);
9239   addl(tmp2, tmp3);
9240   movq(xtmp2, Address(tmp2, 0));
9241 
9242   psllq(xtmp2, 16);
9243   pxor(xtmp1, xtmp2);
9244 
9245   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9246   shrl(in_out, 24);
9247   andl(in_out, 0x000000FF);
9248   shll(in_out, 3);
9249   addl(in_out, tmp3);
9250   movq(xtmp2, Address(in_out, 0));
9251 
9252   psllq(xtmp2, 24);
9253   pxor(xtmp1, xtmp2); // Result in CXMM
9254   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9255 }
9256 
9257 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9258                                       Register in_out,
9259                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9260                                       XMMRegister w_xtmp2,
9261                                       Register tmp1,
9262                                       Register n_tmp2, Register n_tmp3) {
9263   if (is_pclmulqdq_supported) {
9264     movdl(w_xtmp1, in_out);
9265 
9266     movl(tmp1, const_or_pre_comp_const_index);
9267     movdl(w_xtmp2, tmp1);
9268     pclmulqdq(w_xtmp1, w_xtmp2, 0);
    // Keep the result in XMM since the GPR is only 32 bits wide
9270   } else {
9271     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
9272   }
9273 }
9274 
9275 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9276                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9277                                      Register tmp1, Register tmp2,
9278                                      Register n_tmp3) {
9279   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9280   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9281 
9282   psllq(w_xtmp1, 1);
9283   movdl(tmp1, w_xtmp1);
9284   psrlq(w_xtmp1, 32);
9285   movdl(in_out, w_xtmp1);
9286 
9287   xorl(tmp2, tmp2);
9288   crc32(tmp2, tmp1, 4);
9289   xorl(in_out, tmp2);
9290 
9291   psllq(w_xtmp2, 1);
9292   movdl(tmp1, w_xtmp2);
9293   psrlq(w_xtmp2, 32);
9294   movdl(in1, w_xtmp2);
9295 
9296   xorl(tmp2, tmp2);
9297   crc32(tmp2, tmp1, 4);
9298   xorl(in1, tmp2);
9299   xorl(in_out, in1);
9300   xorl(in_out, in2);
9301 }
9302 
9303 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9304                                        Register in_out1, Register in_out2, Register in_out3,
9305                                        Register tmp1, Register tmp2, Register tmp3,
9306                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9307                                        Register tmp4, Register tmp5,
9308                                        Register n_tmp6) {
9309   Label L_processPartitions;
9310   Label L_processPartition;
9311   Label L_exit;
9312 
9313   bind(L_processPartitions);
9314   cmpl(in_out1, 3 * size);
9315   jcc(Assembler::less, L_exit);
9316     xorl(tmp1, tmp1);
9317     xorl(tmp2, tmp2);
9318     movl(tmp3, in_out2);
9319     addl(tmp3, size);
9320 
9321     bind(L_processPartition);
9322       crc32(in_out3, Address(in_out2, 0), 4);
9323       crc32(tmp1, Address(in_out2, size), 4);
9324       crc32(tmp2, Address(in_out2, size*2), 4);
9325       crc32(in_out3, Address(in_out2, 0+4), 4);
9326       crc32(tmp1, Address(in_out2, size+4), 4);
9327       crc32(tmp2, Address(in_out2, size*2+4), 4);
9328       addl(in_out2, 8);
9329       cmpl(in_out2, tmp3);
9330       jcc(Assembler::less, L_processPartition);
9331 
9332         push(tmp3);
9333         push(in_out1);
9334         push(in_out2);
9335         tmp4 = tmp3;
9336         tmp5 = in_out1;
9337         n_tmp6 = in_out2;
9338 
9339       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9340             w_xtmp1, w_xtmp2, w_xtmp3,
9341             tmp4, tmp5,
9342             n_tmp6);
9343 
9344         pop(in_out2);
9345         pop(in_out1);
9346         pop(tmp3);
9347 
9348     addl(in_out2, 2 * size);
9349     subl(in_out1, 3 * size);
9350     jmp(L_processPartitions);
9351 
9352   bind(L_exit);
9353 }
#endif // _LP64
9355 
9356 #ifdef _LP64
9357 // Algorithm 2: Pipelined usage of the CRC32 instruction.
9358 // Input: A buffer I of L bytes.
9359 // Output: the CRC32C value of the buffer.
9360 // Notations:
9361 // Write L = 24N + r, with N = floor (L/24).
9362 // r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B, C each consist of
// N quadwords, and R consists of r bytes.
9365 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
9366 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
9367 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
9368 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
9369 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9370                                           Register tmp1, Register tmp2, Register tmp3,
9371                                           Register tmp4, Register tmp5, Register tmp6,
9372                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9373                                           bool is_pclmulqdq_supported) {
9374   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9375   Label L_wordByWord;
9376   Label L_byteByByteProlog;
9377   Label L_byteByByte;
9378   Label L_exit;
9379 
  if (is_pclmulqdq_supported) {
9381     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9382     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
9383 
9384     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9385     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9386 
9387     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9388     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9389     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
9390   } else {
9391     const_or_pre_comp_const_index[0] = 1;
9392     const_or_pre_comp_const_index[1] = 0;
9393 
9394     const_or_pre_comp_const_index[2] = 3;
9395     const_or_pre_comp_const_index[3] = 2;
9396 
9397     const_or_pre_comp_const_index[4] = 5;
9398     const_or_pre_comp_const_index[5] = 4;
9399    }
9400   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9401                     in2, in1, in_out,
9402                     tmp1, tmp2, tmp3,
9403                     w_xtmp1, w_xtmp2, w_xtmp3,
9404                     tmp4, tmp5,
9405                     tmp6);
9406   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9407                     in2, in1, in_out,
9408                     tmp1, tmp2, tmp3,
9409                     w_xtmp1, w_xtmp2, w_xtmp3,
9410                     tmp4, tmp5,
9411                     tmp6);
9412   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9413                     in2, in1, in_out,
9414                     tmp1, tmp2, tmp3,
9415                     w_xtmp1, w_xtmp2, w_xtmp3,
9416                     tmp4, tmp5,
9417                     tmp6);
9418   movl(tmp1, in2);
9419   andl(tmp1, 0x00000007);
9420   negl(tmp1);
9421   addl(tmp1, in2);
9422   addq(tmp1, in1);
9423 
9424   BIND(L_wordByWord);
9425   cmpq(in1, tmp1);
9426   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9427     crc32(in_out, Address(in1, 0), 4);
9428     addq(in1, 4);
9429     jmp(L_wordByWord);
9430 
9431   BIND(L_byteByByteProlog);
9432   andl(in2, 0x00000007);
9433   movl(tmp2, 1);
9434 
9435   BIND(L_byteByByte);
9436   cmpl(tmp2, in2);
9437   jccb(Assembler::greater, L_exit);
9438     crc32(in_out, Address(in1, 0), 1);
9439     incq(in1);
9440     incl(tmp2);
9441     jmp(L_byteByByte);
9442 
9443   BIND(L_exit);
9444 }
9445 #else
9446 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9447                                           Register tmp1, Register  tmp2, Register tmp3,
9448                                           Register tmp4, Register  tmp5, Register tmp6,
9449                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9450                                           bool is_pclmulqdq_supported) {
9451   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9452   Label L_wordByWord;
9453   Label L_byteByByteProlog;
9454   Label L_byteByByte;
9455   Label L_exit;
9456 
9457   if (is_pclmulqdq_supported) {
9458     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9459     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
9460 
9461     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9462     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9463 
9464     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9465     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9466   } else {
9467     const_or_pre_comp_const_index[0] = 1;
9468     const_or_pre_comp_const_index[1] = 0;
9469 
9470     const_or_pre_comp_const_index[2] = 3;
9471     const_or_pre_comp_const_index[3] = 2;
9472 
9473     const_or_pre_comp_const_index[4] = 5;
9474     const_or_pre_comp_const_index[5] = 4;
9475   }
9476   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9477                     in2, in1, in_out,
9478                     tmp1, tmp2, tmp3,
9479                     w_xtmp1, w_xtmp2, w_xtmp3,
9480                     tmp4, tmp5,
9481                     tmp6);
9482   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9483                     in2, in1, in_out,
9484                     tmp1, tmp2, tmp3,
9485                     w_xtmp1, w_xtmp2, w_xtmp3,
9486                     tmp4, tmp5,
9487                     tmp6);
9488   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9489                     in2, in1, in_out,
9490                     tmp1, tmp2, tmp3,
9491                     w_xtmp1, w_xtmp2, w_xtmp3,
9492                     tmp4, tmp5,
9493                     tmp6);
9494   movl(tmp1, in2);
9495   andl(tmp1, 0x00000007);
9496   negl(tmp1);
9497   addl(tmp1, in2);
9498   addl(tmp1, in1);
9499 
9500   BIND(L_wordByWord);
9501   cmpl(in1, tmp1);
9502   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9503     crc32(in_out, Address(in1,0), 4);
9504     addl(in1, 4);
9505     jmp(L_wordByWord);
9506 
9507   BIND(L_byteByByteProlog);
9508   andl(in2, 0x00000007);
9509   movl(tmp2, 1);
9510 
9511   BIND(L_byteByByte);
9512   cmpl(tmp2, in2);
9513   jccb(Assembler::greater, L_exit);
9514     movb(tmp1, Address(in1, 0));
9515     crc32(in_out, tmp1, 1);
9516     incl(in1);
9517     incl(tmp2);
9518     jmp(L_byteByByte);
9519 
9520   BIND(L_exit);
9521 }
#endif // _LP64
9523 #undef BIND
9524 #undef BLOCK_COMMENT
9525 
9526 // Compress char[] array to byte[].
9527 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
9528 //   @HotSpotIntrinsicCandidate
9529 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
9530 //     for (int i = 0; i < len; i++) {
9531 //       int c = src[srcOff++];
9532 //       if (c >>> 8 != 0) {
9533 //         return 0;
9534 //       }
9535 //       dst[dstOff++] = (byte)c;
9536 //     }
9537 //     return len;
9538 //   }
9539 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
9540   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
9541   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
9542   Register tmp5, Register result) {
9543   Label copy_chars_loop, return_length, return_zero, done;
9544 
9545   // rsi: src
9546   // rdi: dst
9547   // rdx: len
9548   // rcx: tmp5
9549   // rax: result
9550 
9551   // rsi holds start addr of source char[] to be compressed
9552   // rdi holds start addr of destination byte[]
9553   // rdx holds length
9554 
9555   assert(len != result, "");
9556 
9557   // save length for return
9558   push(len);
9559 
9560   if ((UseAVX > 2) && // AVX512
9561     VM_Version::supports_avx512vlbw() &&
9562     VM_Version::supports_bmi2()) {
9563 
9564     Label copy_32_loop, copy_loop_tail, below_threshold;
9565 
9566     // alignment
9567     Label post_alignment;
9568 
    // If the length of the string is less than 32, handle it the old-fashioned way.
9570     testl(len, -32);
9571     jcc(Assembler::zero, below_threshold);
9572 
    // First check whether a character is compressible (<= 0xFF).
    // Create a mask to test for Unicode chars inside the zmm vector.
9575     movl(result, 0x00FF);
9576     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
9577 
9578     testl(len, -64);
9579     jcc(Assembler::zero, post_alignment);
9580 
9581     movl(tmp5, dst);
9582     andl(tmp5, (32 - 1));
9583     negl(tmp5);
9584     andl(tmp5, (32 - 1));
9585 
9586     // bail out when there is nothing to be done
9587     testl(tmp5, 0xFFFFFFFF);
9588     jcc(Assembler::zero, post_alignment);
9589 
    // ~(~0 << tmp5), where tmp5 is the # of chars to process to align dst
9591     movl(result, 0xFFFFFFFF);
9592     shlxl(result, result, tmp5);
9593     notl(result);
9594     kmovdl(k3, result);
9595 
9596     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9597     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9598     ktestd(k2, k3);
9599     jcc(Assembler::carryClear, return_zero);
9600 
9601     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9602 
9603     addptr(src, tmp5);
9604     addptr(src, tmp5);
9605     addptr(dst, tmp5);
9606     subl(len, tmp5);
9607 
9608     bind(post_alignment);
9609     // end of alignment
9610 
9611     movl(tmp5, len);
9612     andl(tmp5, (32 - 1));    // tail count (in chars)
9613     andl(len, ~(32 - 1));    // vector count (in chars)
9614     jcc(Assembler::zero, copy_loop_tail);
9615 
9616     lea(src, Address(src, len, Address::times_2));
9617     lea(dst, Address(dst, len, Address::times_1));
9618     negptr(len);
9619 
9620     bind(copy_32_loop);
9621     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9622     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9623     kortestdl(k2, k2);
9624     jcc(Assembler::carryClear, return_zero);
9625 
    // All elements in the current chunk are valid candidates for compression.
    // Write the truncated byte elements to memory.
9628     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9629     addptr(len, 32);
9630     jcc(Assembler::notZero, copy_32_loop);
9631 
9632     bind(copy_loop_tail);
9633     // bail out when there is nothing to be done
9634     testl(tmp5, 0xFFFFFFFF);
9635     jcc(Assembler::zero, return_length);
9636 
9637     movl(len, tmp5);
9638 
9639     // ~(~0 << len), where len is the # of remaining elements to process
9640     movl(result, 0xFFFFFFFF);
9641     shlxl(result, result, len);
9642     notl(result);
9643 
9644     kmovdl(k3, result);
9645 
9646     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9647     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9648     ktestd(k2, k3);
9649     jcc(Assembler::carryClear, return_zero);
9650 
9651     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9652     jmp(return_length);
9653 
9654     bind(below_threshold);
9655   }
9656 
9657   if (UseSSE42Intrinsics) {
9658     Label copy_32_loop, copy_16, copy_tail;
9659 
9660     movl(result, len);
9661 
9662     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
9663 
9664     // vectored compression
9665     andl(len, 0xfffffff0);    // vector count (in chars)
9666     andl(result, 0x0000000f);    // tail count (in chars)
9667     testl(len, len);
9668     jcc(Assembler::zero, copy_16);
9669 
9670     // compress 16 chars per iter
9671     movdl(tmp1Reg, tmp5);
9672     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9673     pxor(tmp4Reg, tmp4Reg);
9674 
9675     lea(src, Address(src, len, Address::times_2));
9676     lea(dst, Address(dst, len, Address::times_1));
9677     negptr(len);
9678 
9679     bind(copy_32_loop);
9680     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
9681     por(tmp4Reg, tmp2Reg);
9682     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9683     por(tmp4Reg, tmp3Reg);
9684     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
9685     jcc(Assembler::notZero, return_zero);
9686     packuswb(tmp2Reg, tmp3Reg);    // only ASCII chars; compress each to 1 byte
9687     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9688     addptr(len, 16);
9689     jcc(Assembler::notZero, copy_32_loop);
9690 
9691     // compress next vector of 8 chars (if any)
9692     bind(copy_16);
9693     movl(len, result);
9694     andl(len, 0xfffffff8);    // vector count (in chars)
9695     andl(result, 0x00000007);    // tail count (in chars)
9696     testl(len, len);
9697     jccb(Assembler::zero, copy_tail);
9698 
9699     movdl(tmp1Reg, tmp5);
9700     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9701     pxor(tmp3Reg, tmp3Reg);
9702 
9703     movdqu(tmp2Reg, Address(src, 0));
9704     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
9705     jccb(Assembler::notZero, return_zero);
9706     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9707     movq(Address(dst, 0), tmp2Reg);
9708     addptr(src, 16);
9709     addptr(dst, 8);
9710 
9711     bind(copy_tail);
9712     movl(len, result);
9713   }
9714   // compress 1 char per iter
9715   testl(len, len);
9716   jccb(Assembler::zero, return_length);
9717   lea(src, Address(src, len, Address::times_2));
9718   lea(dst, Address(dst, len, Address::times_1));
9719   negptr(len);
9720 
9721   bind(copy_chars_loop);
9722   load_unsigned_short(result, Address(src, len, Address::times_2));
9723   testl(result, 0xff00);      // check if Unicode char
9724   jccb(Assembler::notZero, return_zero);
9725   movb(Address(dst, len, Address::times_1), result);  // ASCII char; compress to 1 byte
9726   increment(len);
9727   jcc(Assembler::notZero, copy_chars_loop);
9728 
9729   // if compression succeeded, return length
9730   bind(return_length);
9731   pop(result);
9732   jmpb(done);
9733 
9734   // if compression failed, return 0
9735   bind(return_zero);
9736   xorl(result, result);
9737   addptr(rsp, wordSize);
9738 
9739   bind(done);
9740 }
9741 
9742 // Inflate byte[] array to char[].
9743 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9744 //   @HotSpotIntrinsicCandidate
9745 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9746 //     for (int i = 0; i < len; i++) {
9747 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
9748 //     }
9749 //   }
9750 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9751   XMMRegister tmp1, Register tmp2) {
9752   Label copy_chars_loop, done, below_threshold;
9753   // rsi: src
9754   // rdi: dst
9755   // rdx: len
9756   // rcx: tmp2
9757 
9758   // rsi holds start addr of source byte[] to be inflated
9759   // rdi holds start addr of destination char[]
9760   // rdx holds length
9761   assert_different_registers(src, dst, len, tmp2);
9762 
9763   if ((UseAVX > 2) && // AVX512
9764     VM_Version::supports_avx512vlbw() &&
9765     VM_Version::supports_bmi2()) {
9766 
9767     Label copy_32_loop, copy_tail;
9768     Register tmp3_aliased = len;
9769 
    // If the length of the string is less than 16, handle it the old-fashioned way.
9771     testl(len, -16);
9772     jcc(Assembler::zero, below_threshold);
9773 
    // In order to use only one arithmetic operation in the main loop, we
    // pre-calculate the addresses and counts here.
9776     movl(tmp2, len);
9777     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9778     andl(len, -32);     // vector count
9779     jccb(Assembler::zero, copy_tail);
9780 
9781     lea(src, Address(src, len, Address::times_1));
9782     lea(dst, Address(dst, len, Address::times_2));
9783     negptr(len);
9784 
9785 
9786     // inflate 32 chars per iter
9787     bind(copy_32_loop);
9788     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9789     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9790     addptr(len, 32);
9791     jcc(Assembler::notZero, copy_32_loop);
9792 
9793     bind(copy_tail);
9794     // bail out when there is nothing to be done
9795     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9796     jcc(Assembler::zero, done);
9797 
9798     // ~(~0 << length), where length is the # of remaining elements to process
9799     movl(tmp3_aliased, -1);
9800     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9801     notl(tmp3_aliased);
9802     kmovdl(k2, tmp3_aliased);
9803     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
9804     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
9805 
9806     jmp(done);
9807   }
9808   if (UseSSE42Intrinsics) {
9809     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9810 
9811     movl(tmp2, len);
9812 
9813     if (UseAVX > 1) {
9814       andl(tmp2, (16 - 1));
9815       andl(len, -16);
9816       jccb(Assembler::zero, copy_new_tail);
9817     } else {
9818       andl(tmp2, 0x00000007);   // tail count (in chars)
9819       andl(len, 0xfffffff8);    // vector count (in chars)
9820       jccb(Assembler::zero, copy_tail);
9821     }
9822 
9823     // vectored inflation
9824     lea(src, Address(src, len, Address::times_1));
9825     lea(dst, Address(dst, len, Address::times_2));
9826     negptr(len);
9827 
9828     if (UseAVX > 1) {
9829       bind(copy_16_loop);
9830       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9831       vmovdqu(Address(dst, len, Address::times_2), tmp1);
9832       addptr(len, 16);
9833       jcc(Assembler::notZero, copy_16_loop);
9834 
9835       bind(below_threshold);
9836       bind(copy_new_tail);
9837       if ((UseAVX > 2) &&
9838         VM_Version::supports_avx512vlbw() &&
9839         VM_Version::supports_bmi2()) {
9840         movl(tmp2, len);
9841       } else {
9842         movl(len, tmp2);
9843       }
9844       andl(tmp2, 0x00000007);
9845       andl(len, 0xFFFFFFF8);
9846       jccb(Assembler::zero, copy_tail);
9847 
9848       pmovzxbw(tmp1, Address(src, 0));
9849       movdqu(Address(dst, 0), tmp1);
9850       addptr(src, 8);
9851       addptr(dst, 2 * 8);
9852 
9853       jmp(copy_tail, true);
9854     }
9855 
9856     // inflate 8 chars per iter
9857     bind(copy_8_loop);
9858     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
9859     movdqu(Address(dst, len, Address::times_2), tmp1);
9860     addptr(len, 8);
9861     jcc(Assembler::notZero, copy_8_loop);
9862 
9863     bind(copy_tail);
9864     movl(len, tmp2);
9865 
9866     cmpl(len, 4);
9867     jccb(Assembler::less, copy_bytes);
9868 
9869     movdl(tmp1, Address(src, 0));  // load 4 byte chars
9870     pmovzxbw(tmp1, tmp1);
9871     movq(Address(dst, 0), tmp1);
9872     subptr(len, 4);
9873     addptr(src, 4);
9874     addptr(dst, 8);
9875 
9876     bind(copy_bytes);
9877   } else {
9878     bind(below_threshold);
9879   }
9880 
9881   testl(len, len);
9882   jccb(Assembler::zero, done);
9883   lea(src, Address(src, len, Address::times_1));
9884   lea(dst, Address(dst, len, Address::times_2));
9885   negptr(len);
9886 
9887   // inflate 1 char per iter
9888   bind(copy_chars_loop);
9889   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
9890   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
9891   increment(len);
9892   jcc(Assembler::notZero, copy_chars_loop);
9893 
9894   bind(done);
9895 }
9896 
9897 #ifdef _LP64
9898 void MacroAssembler::cache_wb(Address line)
9899 {
  // 64-bit CPUs always support clflush
9901   assert(VM_Version::supports_clflush(), "clflush should be available");
9902   bool optimized = VM_Version::supports_clflushopt();
9903   bool no_evict = VM_Version::supports_clwb();
9904 
  // Prefer clwb (writeback without evict); otherwise prefer clflushopt
  // (potentially parallel writeback with evict); otherwise fall back to
  // clflush (serial writeback with evict).
9908 
9909   if (optimized) {
9910     if (no_evict) {
9911       clwb(line);
9912     } else {
9913       clflushopt(line);
9914     }
9915   } else {
9916     // no need for fence when using CLFLUSH
9917     clflush(line);
9918   }
9919 }
9920 
9921 void MacroAssembler::cache_wbsync(bool is_pre)
9922 {
9923   assert(VM_Version::supports_clflush(), "clflush should be available");
9924   bool optimized = VM_Version::supports_clflushopt();
9925   bool no_evict = VM_Version::supports_clwb();
9926 
9927   // pick the correct implementation
9928 
9929   if (!is_pre && (optimized || no_evict)) {
    // Need an sfence for a post flush when using clflushopt or clwb;
    // otherwise no need for any synchronization.
9932 
9933     sfence();
9934   }
9935 }
9936 #endif // _LP64
9937 
9938 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9939   switch (cond) {
9940     // Note some conditions are synonyms for others
9941     case Assembler::zero:         return Assembler::notZero;
9942     case Assembler::notZero:      return Assembler::zero;
9943     case Assembler::less:         return Assembler::greaterEqual;
9944     case Assembler::lessEqual:    return Assembler::greater;
9945     case Assembler::greater:      return Assembler::lessEqual;
9946     case Assembler::greaterEqual: return Assembler::less;
9947     case Assembler::below:        return Assembler::aboveEqual;
9948     case Assembler::belowEqual:   return Assembler::above;
9949     case Assembler::above:        return Assembler::belowEqual;
9950     case Assembler::aboveEqual:   return Assembler::below;
9951     case Assembler::overflow:     return Assembler::noOverflow;
9952     case Assembler::noOverflow:   return Assembler::overflow;
9953     case Assembler::negative:     return Assembler::positive;
9954     case Assembler::positive:     return Assembler::negative;
9955     case Assembler::parity:       return Assembler::noParity;
9956     case Assembler::noParity:     return Assembler::parity;
9957   }
9958   ShouldNotReachHere(); return Assembler::overflow;
9959 }
9960 
9961 SkipIfEqual::SkipIfEqual(
9962     MacroAssembler* masm, const bool* flag_addr, bool value) {
9963   _masm = masm;
9964   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9965   _masm->jcc(Assembler::equal, _label);
9966 }
9967 
9968 SkipIfEqual::~SkipIfEqual() {
9969   _masm->bind(_label);
9970 }
9971 
9972 // 32-bit Windows has its own fast-path implementation
9973 // of get_thread
9974 #if !defined(WIN32) || defined(_LP64)
9975 
9976 // This is simply a call to Thread::current()
9977 void MacroAssembler::get_thread(Register thread) {
9978   if (thread != rax) {
9979     push(rax);
9980   }
9981   LP64_ONLY(push(rdi);)
9982   LP64_ONLY(push(rsi);)
9983   push(rdx);
9984   push(rcx);
9985 #ifdef _LP64
9986   push(r8);
9987   push(r9);
9988   push(r10);
9989   push(r11);
9990 #endif
9991 
9992   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9993 
9994 #ifdef _LP64
9995   pop(r11);
9996   pop(r10);
9997   pop(r9);
9998   pop(r8);
9999 #endif
10000   pop(rcx);
10001   pop(rdx);
10002   LP64_ONLY(pop(rsi);)
10003   LP64_ONLY(pop(rdi);)
10004   if (thread != rax) {
10005     mov(thread, rax);
10006     pop(rax);
10007   }
10008 }
10009 
10010 #endif