1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/collectedHeap.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "memory/universe.hpp"
  36 #include "oops/accessDecorators.hpp"
  37 #include "oops/klass.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/biasedLocking.hpp"
  40 #include "runtime/flags/flagSetting.hpp"
  41 #include "runtime/interfaceSupport.inline.hpp"
  42 #include "runtime/objectMonitor.hpp"
  43 #include "runtime/os.hpp"
  44 #include "runtime/safepoint.hpp"
  45 #include "runtime/safepointMechanism.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubRoutines.hpp"
  48 #include "runtime/thread.hpp"
  49 #include "utilities/macros.hpp"
  50 #include "crc32c.h"
  51 #ifdef COMPILER2
  52 #include "opto/intrinsicnode.hpp"
  53 #endif
  54 
  55 #ifdef PRODUCT
  56 #define BLOCK_COMMENT(str) /* nothing */
  57 #define STOP(error) stop(error)
  58 #else
  59 #define BLOCK_COMMENT(str) block_comment(str)
  60 #define STOP(error) block_comment(error); stop(error)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 
  65 #ifdef ASSERT
  66 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  67 #endif
  68 
  69 static Assembler::Condition reverse[] = {
  70     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  71     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  72     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  73     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  74     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  75     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  76     Assembler::above          /* belowEqual    = 0x6 */ ,
  77     Assembler::belowEqual     /* above         = 0x7 */ ,
  78     Assembler::positive       /* negative      = 0x8 */ ,
  79     Assembler::negative       /* positive      = 0x9 */ ,
  80     Assembler::noParity       /* parity        = 0xa */ ,
  81     Assembler::parity         /* noParity      = 0xb */ ,
  82     Assembler::greaterEqual   /* less          = 0xc */ ,
  83     Assembler::less           /* greaterEqual  = 0xd */ ,
  84     Assembler::greater        /* lessEqual     = 0xe */ ,
   85     Assembler::lessEqual      /* greater       = 0xf */
  86 
  87 };
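
// The reverse[] table maps each condition code to its logical negation,
// so callers can invert a branch without re-doing the compare. For
// illustration only (cc and L below are hypothetical):
//
//   jcc(reverse[cc], L);   // taken exactly when 'cc' does NOT hold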
  88 
  89 
  90 // Implementation of MacroAssembler
  91 
   92 // First, all the versions that are distinct for 32 vs. 64 bit,
   93 // unless the difference is trivial (1 line or so).
  94 
  95 #ifndef _LP64
  96 
  97 // 32bit versions
  98 
  99 Address MacroAssembler::as_Address(AddressLiteral adr) {
 100   return Address(adr.target(), adr.rspec());
 101 }
 102 
 103 Address MacroAssembler::as_Address(ArrayAddress adr) {
 104   return Address::make_array(adr);
 105 }
 106 
 107 void MacroAssembler::call_VM_leaf_base(address entry_point,
 108                                        int number_of_arguments) {
 109   call(RuntimeAddress(entry_point));
 110   increment(rsp, number_of_arguments * wordSize);
 111 }
 112 
 113 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 114   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 115 }
 116 
 117 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 118   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 119 }
 120 
 121 void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
 122   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 123 }
 124 
 125 void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
 126   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 127 }
 128 
 129 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 130   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 131   bs->obj_equals(this, src1, obj);
 132 }
 133 
 134 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 135   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 136   bs->obj_equals(this, src1, obj);
 137 }
 138 
 139 void MacroAssembler::extend_sign(Register hi, Register lo) {
 140   // According to Intel Doc. AP-526, "Integer Divide", p.18.
 141   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 142     cdql();
 143   } else {
 144     movl(hi, lo);
 145     sarl(hi, 31);
 146   }
 147 }
 148 
 149 void MacroAssembler::jC2(Register tmp, Label& L) {
 150   // set parity bit if FPU flag C2 is set (via rax)
 151   save_rax(tmp);
 152   fwait(); fnstsw_ax();
 153   sahf();
 154   restore_rax(tmp);
 155   // branch
 156   jcc(Assembler::parity, L);
 157 }
 158 
 159 void MacroAssembler::jnC2(Register tmp, Label& L) {
 160   // set parity bit if FPU flag C2 is set (via rax)
 161   save_rax(tmp);
 162   fwait(); fnstsw_ax();
 163   sahf();
 164   restore_rax(tmp);
 165   // branch
 166   jcc(Assembler::noParity, L);
 167 }
 168 
 169 // 32bit can do a case table jump in one instruction but we no longer allow the base
 170 // to be installed in the Address class
 171 void MacroAssembler::jump(ArrayAddress entry) {
 172   jmp(as_Address(entry));
 173 }
 174 
 175 // Note: y_lo will be destroyed
 176 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 177   // Long compare for Java (semantics as described in JVM spec.)
 178   Label high, low, done;
 179 
 180   cmpl(x_hi, y_hi);
 181   jcc(Assembler::less, low);
 182   jcc(Assembler::greater, high);
 183   // x_hi is the return register
 184   xorl(x_hi, x_hi);
 185   cmpl(x_lo, y_lo);
 186   jcc(Assembler::below, low);
 187   jcc(Assembler::equal, done);
 188 
 189   bind(high);
 190   xorl(x_hi, x_hi);
 191   increment(x_hi);
 192   jmp(done);
 193 
 194   bind(low);
 195   xorl(x_hi, x_hi);
 196   decrementl(x_hi);
 197 
 198   bind(done);
 199 }
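
// For illustration, the sequence above computes the Java lcmp result
// (-1, 0 or 1, left in x_hi); a rough C equivalent is:
//
//   int lcmp(int32_t x_hi, uint32_t x_lo, int32_t y_hi, uint32_t y_lo) {
//     if (x_hi != y_hi) return (x_hi < y_hi) ? -1 : 1;   // signed high words
//     if (x_lo != y_lo) return (x_lo < y_lo) ? -1 : 1;   // unsigned low words
//     return 0;
//   }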
 200 
 201 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 202     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 203 }
 204 
 205 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 206   // leal(dst, as_Address(adr));
 207   // see note in movl as to why we must use a move
 208   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 209 }
 210 
 211 void MacroAssembler::leave() {
 212   mov(rsp, rbp);
 213   pop(rbp);
 214 }
 215 
 216 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 217   // Multiplication of two Java long values stored on the stack
 218   // as illustrated below. Result is in rdx:rax.
 219   //
 220   // rsp ---> [  ??  ] \               \
 221   //            ....    | y_rsp_offset  |
 222   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 223   //          [ y_hi ]                  | (in bytes)
 224   //            ....                    |
 225   //          [ x_lo ]                 /
 226   //          [ x_hi ]
 227   //            ....
 228   //
 229   // Basic idea: lo(result) = lo(x_lo * y_lo)
 230   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
 231   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 232   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 233   Label quick;
 234   // load x_hi, y_hi and check if quick
 235   // multiplication is possible
 236   movl(rbx, x_hi);
 237   movl(rcx, y_hi);
 238   movl(rax, rbx);
 239   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
 240   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
 241   // do full multiplication
 242   // 1st step
 243   mull(y_lo);                                    // x_hi * y_lo
 244   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
 245   // 2nd step
 246   movl(rax, x_lo);
 247   mull(rcx);                                     // x_lo * y_hi
 248   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
 249   // 3rd step
 250   bind(quick);                                   // note: rbx, = 0 if quick multiply!
 251   movl(rax, x_lo);
 252   mull(y_lo);                                    // x_lo * y_lo
 253   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 254 }
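
// Illustrative C sketch of the decomposition used above (all arithmetic is
// mod 2^64, so only the low 32 bits of the cross products reach the high
// word of the result):
//
//   uint64_t lo_prod = (uint64_t)x_lo * y_lo;
//   uint32_t lo      = (uint32_t)lo_prod;
//   uint32_t hi      = (uint32_t)(lo_prod >> 32)
//                    + (uint32_t)((uint64_t)x_hi * y_lo)    // lo(x_hi * y_lo)
//                    + (uint32_t)((uint64_t)x_lo * y_hi);   // lo(x_lo * y_hi)
//   // rdx:rax == hi:lo == (uint64_t)x * (uint64_t)y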
 255 
 256 void MacroAssembler::lneg(Register hi, Register lo) {
 257   negl(lo);
 258   adcl(hi, 0);
 259   negl(hi);
 260 }
 261 
 262 void MacroAssembler::lshl(Register hi, Register lo) {
 263   // Java shift left long support (semantics as described in JVM spec., p.305)
 264   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 265   // shift value is in rcx !
 266   assert(hi != rcx, "must not use rcx");
 267   assert(lo != rcx, "must not use rcx");
 268   const Register s = rcx;                        // shift count
 269   const int      n = BitsPerWord;
 270   Label L;
 271   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 272   cmpl(s, n);                                    // if (s < n)
 273   jcc(Assembler::less, L);                       // else (s >= n)
 274   movl(hi, lo);                                  // x := x << n
 275   xorl(lo, lo);
 276   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 277   bind(L);                                       // s (mod n) < n
 278   shldl(hi, lo);                                 // x := x << s
 279   shll(lo);
 280 }
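
// Example: for a shift count s = 40 the code above first performs
// hi := lo, lo := 0 (i.e. x <<= 32); the following shld/shl then use
// only the low 5 bits of rcx (40 mod 32 == 8), so the net effect is
// x << 40, matching the JVM spec's "shift distance mod 64" rule for longs.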
 281 
 282 
 283 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 284   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 285   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
 286   assert(hi != rcx, "must not use rcx");
 287   assert(lo != rcx, "must not use rcx");
 288   const Register s = rcx;                        // shift count
 289   const int      n = BitsPerWord;
 290   Label L;
 291   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 292   cmpl(s, n);                                    // if (s < n)
 293   jcc(Assembler::less, L);                       // else (s >= n)
 294   movl(lo, hi);                                  // x := x >> n
 295   if (sign_extension) sarl(hi, 31);
 296   else                xorl(hi, hi);
 297   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 298   bind(L);                                       // s (mod n) < n
 299   shrdl(lo, hi);                                 // x := x >> s
 300   if (sign_extension) sarl(hi);
 301   else                shrl(hi);
 302 }
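
// Example: for s = 40 the code does lo := hi and fills hi with the sign
// (sarl(hi, 31)) or with zero, then the shrd plus sar/shr pair shifts by
// the remaining 40 mod 32 == 8 bits, giving the arithmetic or logical >> 40.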
 303 
 304 void MacroAssembler::movoop(Register dst, jobject obj) {
 305   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 306 }
 307 
 308 void MacroAssembler::movoop(Address dst, jobject obj) {
 309   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 310 }
 311 
 312 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 313   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 314 }
 315 
 316 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 317   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 318 }
 319 
 320 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 321   // scratch register is not used,
 322   // it is defined to match parameters of 64-bit version of this method.
 323   if (src.is_lval()) {
 324     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 325   } else {
 326     movl(dst, as_Address(src));
 327   }
 328 }
 329 
 330 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 331   movl(as_Address(dst), src);
 332 }
 333 
 334 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 335   movl(dst, as_Address(src));
 336 }
 337 
 338 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 339 void MacroAssembler::movptr(Address dst, intptr_t src) {
 340   movl(dst, src);
 341 }
 342 
 343 
 344 void MacroAssembler::pop_callee_saved_registers() {
 345   pop(rcx);
 346   pop(rdx);
 347   pop(rdi);
 348   pop(rsi);
 349 }
 350 
 351 void MacroAssembler::pop_fTOS() {
 352   fld_d(Address(rsp, 0));
 353   addl(rsp, 2 * wordSize);
 354 }
 355 
 356 void MacroAssembler::push_callee_saved_registers() {
 357   push(rsi);
 358   push(rdi);
 359   push(rdx);
 360   push(rcx);
 361 }
 362 
 363 void MacroAssembler::push_fTOS() {
 364   subl(rsp, 2 * wordSize);
 365   fstp_d(Address(rsp, 0));
 366 }
 367 
 368 
 369 void MacroAssembler::pushoop(jobject obj) {
 370   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 371 }
 372 
 373 void MacroAssembler::pushklass(Metadata* obj) {
 374   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 375 }
 376 
 377 void MacroAssembler::pushptr(AddressLiteral src) {
 378   if (src.is_lval()) {
 379     push_literal32((int32_t)src.target(), src.rspec());
 380   } else {
 381     pushl(as_Address(src));
 382   }
 383 }
 384 
 385 void MacroAssembler::set_word_if_not_zero(Register dst) {
 386   xorl(dst, dst);
 387   set_byte_if_not_zero(dst);
 388 }
 389 
 390 static void pass_arg0(MacroAssembler* masm, Register arg) {
 391   masm->push(arg);
 392 }
 393 
 394 static void pass_arg1(MacroAssembler* masm, Register arg) {
 395   masm->push(arg);
 396 }
 397 
 398 static void pass_arg2(MacroAssembler* masm, Register arg) {
 399   masm->push(arg);
 400 }
 401 
 402 static void pass_arg3(MacroAssembler* masm, Register arg) {
 403   masm->push(arg);
 404 }
 405 
 406 #ifndef PRODUCT
 407 extern "C" void findpc(intptr_t x);
 408 #endif
 409 
 410 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 411   // In order to get locks to work, we need to fake an in_VM state
 412   JavaThread* thread = JavaThread::current();
 413   JavaThreadState saved_state = thread->thread_state();
 414   thread->set_thread_state(_thread_in_vm);
 415   if (ShowMessageBoxOnError) {
 416     JavaThread* thread = JavaThread::current();
 417     JavaThreadState saved_state = thread->thread_state();
 418     thread->set_thread_state(_thread_in_vm);
 419     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 420       ttyLocker ttyl;
 421       BytecodeCounter::print();
 422     }
 423     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 424     // This is the value of eip which points to where verify_oop will return.
 425     if (os::message_box(msg, "Execution stopped, print registers?")) {
 426       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 427       BREAKPOINT;
 428     }
 429   } else {
 430     ttyLocker ttyl;
 431     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
 432   }
 433   // Don't assert holding the ttyLock
 434   assert(false, "DEBUG MESSAGE: %s", msg);
 435   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 436 }
 437 
 438 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 439   ttyLocker ttyl;
 440   FlagSetting fs(Debugging, true);
 441   tty->print_cr("eip = 0x%08x", eip);
 442 #ifndef PRODUCT
 443   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 444     tty->cr();
 445     findpc(eip);
 446     tty->cr();
 447   }
 448 #endif
 449 #define PRINT_REG(rax) \
 450   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 451   PRINT_REG(rax);
 452   PRINT_REG(rbx);
 453   PRINT_REG(rcx);
 454   PRINT_REG(rdx);
 455   PRINT_REG(rdi);
 456   PRINT_REG(rsi);
 457   PRINT_REG(rbp);
 458   PRINT_REG(rsp);
 459 #undef PRINT_REG
 460   // Print some words near top of stack.
 461   int* dump_sp = (int*) rsp;
 462   for (int col1 = 0; col1 < 8; col1++) {
 463     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 464     os::print_location(tty, *dump_sp++);
 465   }
 466   for (int row = 0; row < 16; row++) {
 467     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 468     for (int col = 0; col < 8; col++) {
 469       tty->print(" 0x%08x", *dump_sp++);
 470     }
 471     tty->cr();
 472   }
 473   // Print some instructions around pc:
 474   Disassembler::decode((address)eip-64, (address)eip);
 475   tty->print_cr("--------");
 476   Disassembler::decode((address)eip, (address)eip+32);
 477 }
 478 
 479 void MacroAssembler::stop(const char* msg) {
 480   ExternalAddress message((address)msg);
 481   // push address of message
 482   pushptr(message.addr());
 483   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 484   pusha();                                            // push registers
 485   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 486   hlt();
 487 }
 488 
 489 void MacroAssembler::warn(const char* msg) {
 490   push_CPU_state();
 491 
 492   ExternalAddress message((address) msg);
 493   // push address of message
 494   pushptr(message.addr());
 495 
 496   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 497   addl(rsp, wordSize);       // discard argument
 498   pop_CPU_state();
 499 }
 500 
 501 void MacroAssembler::print_state() {
 502   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 503   pusha();                                            // push registers
 504 
 505   push_CPU_state();
 506   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 507   pop_CPU_state();
 508 
 509   popa();
 510   addl(rsp, wordSize);
 511 }
 512 
 513 #else // _LP64
 514 
 515 // 64 bit versions
 516 
 517 Address MacroAssembler::as_Address(AddressLiteral adr) {
 518   // amd64 always does this as a pc-rel
 519   // we can be absolute or disp based on the instruction type
 520   // jmp/call are displacements, others are absolute
 521   assert(!adr.is_lval(), "must be rval");
 522   assert(reachable(adr), "must be");
 523   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 524 
 525 }
 526 
 527 Address MacroAssembler::as_Address(ArrayAddress adr) {
 528   AddressLiteral base = adr.base();
 529   lea(rscratch1, base);
 530   Address index = adr.index();
 531   assert(index._disp == 0, "must not have disp"); // maybe it can?
 532   Address array(rscratch1, index._index, index._scale, index._disp);
 533   return array;
 534 }
 535 
 536 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 537   Label L, E;
 538 
 539 #ifdef _WIN64
 540   // Windows always allocates space for its register args
 541   assert(num_args <= 4, "only register arguments supported");
 542   subq(rsp,  frame::arg_reg_save_area_bytes);
 543 #endif
 544 
 545   // Align stack if necessary
 546   testl(rsp, 15);
 547   jcc(Assembler::zero, L);
 548 
 549   subq(rsp, 8);
 550   {
 551     call(RuntimeAddress(entry_point));
 552   }
 553   addq(rsp, 8);
 554   jmp(E);
 555 
 556   bind(L);
 557   {
 558     call(RuntimeAddress(entry_point));
 559   }
 560 
 561   bind(E);
 562 
 563 #ifdef _WIN64
 564   // restore stack pointer
 565   addq(rsp, frame::arg_reg_save_area_bytes);
 566 #endif
 567 
 568 }
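
// Note on the alignment fix-up above: the System V AMD64 ABI requires rsp
// to be 16-byte aligned at the call instruction. For example, if
// rsp == 0x7fffffffe878 then (rsp & 15) == 8, so the subq(rsp, 8) before
// the call (and the matching addq(rsp, 8) afterwards) restores 16-byte
// alignment; if rsp is already aligned the call is made directly.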
 569 
 570 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 571   assert(!src2.is_lval(), "should use cmpptr");
 572 
 573   if (reachable(src2)) {
 574     cmpq(src1, as_Address(src2));
 575   } else {
 576     lea(rscratch1, src2);
 577     Assembler::cmpq(src1, Address(rscratch1, 0));
 578   }
 579 }
 580 
 581 int MacroAssembler::corrected_idivq(Register reg) {
 582   // Full implementation of Java ldiv and lrem; checks for special
 583   // case as described in JVM spec., p.243 & p.271.  The function
 584   // returns the (pc) offset of the idivq instruction - may be needed
 585   // for implicit exceptions.
 586   //
 587   //         normal case                           special case
 588   //
 589   // input : rax: dividend                         min_long
 590   //         reg: divisor   (may not be rax/rdx)   -1
 591   //
 592   // output: rax: quotient  (= rax idiv reg)       min_long
 593   //         rdx: remainder (= rax irem reg)       0
 594   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 595   static const int64_t min_long = 0x8000000000000000;
 596   Label normal_case, special_case;
 597 
 598   // check for special case
 599   cmp64(rax, ExternalAddress((address) &min_long));
 600   jcc(Assembler::notEqual, normal_case);
 601   xorl(rdx, rdx); // prepare rdx for possible special case (where
 602                   // remainder = 0)
 603   cmpq(reg, -1);
 604   jcc(Assembler::equal, special_case);
 605 
 606   // handle normal case
 607   bind(normal_case);
 608   cdqq();
 609   int idivq_offset = offset();
 610   idivq(reg);
 611 
 612   // normal and special case exit
 613   bind(special_case);
 614 
 615   return idivq_offset;
 616 }
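
// Worked example of the special case handled above: for Long.MIN_VALUE / -1
// the mathematically correct quotient 2^63 is not representable and idivq
// would raise a divide error, so the code instead returns
// quotient = Long.MIN_VALUE (rax unchanged) and remainder = 0 (rdx cleared),
// as the JVM spec requires. All other operand pairs take the cdqq/idivq
// path, which leaves the quotient in rax and the remainder in rdx.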
 617 
 618 void MacroAssembler::decrementq(Register reg, int value) {
 619   if (value == min_jint) { subq(reg, value); return; }
 620   if (value <  0) { incrementq(reg, -value); return; }
 621   if (value == 0) {                        ; return; }
 622   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 623   /* else */      { subq(reg, value)       ; return; }
 624 }
 625 
 626 void MacroAssembler::decrementq(Address dst, int value) {
 627   if (value == min_jint) { subq(dst, value); return; }
 628   if (value <  0) { incrementq(dst, -value); return; }
 629   if (value == 0) {                        ; return; }
 630   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 631   /* else */      { subq(dst, value)       ; return; }
 632 }
 633 
 634 void MacroAssembler::incrementq(AddressLiteral dst) {
 635   if (reachable(dst)) {
 636     incrementq(as_Address(dst));
 637   } else {
 638     lea(rscratch1, dst);
 639     incrementq(Address(rscratch1, 0));
 640   }
 641 }
 642 
 643 void MacroAssembler::incrementq(Register reg, int value) {
 644   if (value == min_jint) { addq(reg, value); return; }
 645   if (value <  0) { decrementq(reg, -value); return; }
 646   if (value == 0) {                        ; return; }
 647   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 648   /* else */      { addq(reg, value)       ; return; }
 649 }
 650 
 651 void MacroAssembler::incrementq(Address dst, int value) {
 652   if (value == min_jint) { addq(dst, value); return; }
 653   if (value <  0) { decrementq(dst, -value); return; }
 654   if (value == 0) {                        ; return; }
 655   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 656   /* else */      { addq(dst, value)       ; return; }
 657 }
 658 
 659 // 32bit can do a case table jump in one instruction but we no longer allow the base
 660 // to be installed in the Address class
 661 void MacroAssembler::jump(ArrayAddress entry) {
 662   lea(rscratch1, entry.base());
 663   Address dispatch = entry.index();
 664   assert(dispatch._base == noreg, "must be");
 665   dispatch._base = rscratch1;
 666   jmp(dispatch);
 667 }
 668 
 669 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 670   ShouldNotReachHere(); // 64bit doesn't use two regs
 671   cmpq(x_lo, y_lo);
 672 }
 673 
 674 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 675     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 676 }
 677 
 678 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 679   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 680   movptr(dst, rscratch1);
 681 }
 682 
 683 void MacroAssembler::leave() {
 684   // %%% is this really better? Why not on 32bit too?
 685   emit_int8((unsigned char)0xC9); // LEAVE
 686 }
 687 
 688 void MacroAssembler::lneg(Register hi, Register lo) {
 689   ShouldNotReachHere(); // 64bit doesn't use two regs
 690   negq(lo);
 691 }
 692 
 693 void MacroAssembler::movoop(Register dst, jobject obj) {
 694   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 695 }
 696 
 697 void MacroAssembler::movoop(Address dst, jobject obj) {
 698   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 699   movq(dst, rscratch1);
 700 }
 701 
 702 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 703   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 704 }
 705 
 706 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 707   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 708   movq(dst, rscratch1);
 709 }
 710 
 711 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 712   if (src.is_lval()) {
 713     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 714   } else {
 715     if (reachable(src)) {
 716       movq(dst, as_Address(src));
 717     } else {
 718       lea(scratch, src);
 719       movq(dst, Address(scratch, 0));
 720     }
 721   }
 722 }
 723 
 724 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 725   movq(as_Address(dst), src);
 726 }
 727 
 728 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 729   movq(dst, as_Address(src));
 730 }
 731 
 732 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 733 void MacroAssembler::movptr(Address dst, intptr_t src) {
 734   mov64(rscratch1, src);
 735   movq(dst, rscratch1);
 736 }
 737 
 738 // These are mostly for initializing NULL
 739 void MacroAssembler::movptr(Address dst, int32_t src) {
 740   movslq(dst, src);
 741 }
 742 
 743 void MacroAssembler::movptr(Register dst, int32_t src) {
 744   mov64(dst, (intptr_t)src);
 745 }
 746 
 747 void MacroAssembler::pushoop(jobject obj) {
 748   movoop(rscratch1, obj);
 749   push(rscratch1);
 750 }
 751 
 752 void MacroAssembler::pushklass(Metadata* obj) {
 753   mov_metadata(rscratch1, obj);
 754   push(rscratch1);
 755 }
 756 
 757 void MacroAssembler::pushptr(AddressLiteral src) {
 758   lea(rscratch1, src);
 759   if (src.is_lval()) {
 760     push(rscratch1);
 761   } else {
 762     pushq(Address(rscratch1, 0));
 763   }
 764 }
 765 
 766 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 767   // we must set sp to zero to clear frame
 768   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 769   // must clear fp, so that compiled frames are not confused; it is
 770   // possible that we need it only for debugging
 771   if (clear_fp) {
 772     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 773   }
 774 
 775   // Always clear the pc because it could have been set by make_walkable()
 776   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 777   vzeroupper();
 778 }
 779 
 780 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 781                                          Register last_java_fp,
 782                                          address  last_java_pc) {
 783   vzeroupper();
 784   // determine last_java_sp register
 785   if (!last_java_sp->is_valid()) {
 786     last_java_sp = rsp;
 787   }
 788 
 789   // last_java_fp is optional
 790   if (last_java_fp->is_valid()) {
 791     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 792            last_java_fp);
 793   }
 794 
 795   // last_java_pc is optional
 796   if (last_java_pc != NULL) {
 797     Address java_pc(r15_thread,
 798                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 799     lea(rscratch1, InternalAddress(last_java_pc));
 800     movptr(java_pc, rscratch1);
 801   }
 802 
 803   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 804 }
 805 
 806 static void pass_arg0(MacroAssembler* masm, Register arg) {
 807   if (c_rarg0 != arg ) {
 808     masm->mov(c_rarg0, arg);
 809   }
 810 }
 811 
 812 static void pass_arg1(MacroAssembler* masm, Register arg) {
 813   if (c_rarg1 != arg ) {
 814     masm->mov(c_rarg1, arg);
 815   }
 816 }
 817 
 818 static void pass_arg2(MacroAssembler* masm, Register arg) {
 819   if (c_rarg2 != arg ) {
 820     masm->mov(c_rarg2, arg);
 821   }
 822 }
 823 
 824 static void pass_arg3(MacroAssembler* masm, Register arg) {
 825   if (c_rarg3 != arg ) {
 826     masm->mov(c_rarg3, arg);
 827   }
 828 }
 829 
 830 void MacroAssembler::stop(const char* msg) {
 831   address rip = pc();
 832   pusha(); // get regs on stack
 833   lea(c_rarg0, ExternalAddress((address) msg));
 834   lea(c_rarg1, InternalAddress(rip));
 835   movq(c_rarg2, rsp); // pass pointer to regs array
 836   andq(rsp, -16); // align stack as required by ABI
 837   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 838   hlt();
 839 }
 840 
 841 void MacroAssembler::warn(const char* msg) {
 842   push(rbp);
 843   movq(rbp, rsp);
 844   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 845   push_CPU_state();   // keeps alignment at 16 bytes
 846   lea(c_rarg0, ExternalAddress((address) msg));
 847   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 848   call(rax);
 849   pop_CPU_state();
 850   mov(rsp, rbp);
 851   pop(rbp);
 852 }
 853 
 854 void MacroAssembler::print_state() {
 855   address rip = pc();
 856   pusha();            // get regs on stack
 857   push(rbp);
 858   movq(rbp, rsp);
 859   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 860   push_CPU_state();   // keeps alignment at 16 bytes
 861 
 862   lea(c_rarg0, InternalAddress(rip));
 863   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 864   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 865 
 866   pop_CPU_state();
 867   mov(rsp, rbp);
 868   pop(rbp);
 869   popa();
 870 }
 871 
 872 #ifndef PRODUCT
 873 extern "C" void findpc(intptr_t x);
 874 #endif
 875 
 876 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 877   // In order to get locks to work, we need to fake an in_VM state
 878   if (ShowMessageBoxOnError) {
 879     JavaThread* thread = JavaThread::current();
 880     JavaThreadState saved_state = thread->thread_state();
 881     thread->set_thread_state(_thread_in_vm);
 882 #ifndef PRODUCT
 883     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 884       ttyLocker ttyl;
 885       BytecodeCounter::print();
 886     }
 887 #endif
 888     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 889     // XXX correct this offset for amd64
 890     // This is the value of eip which points to where verify_oop will return.
 891     if (os::message_box(msg, "Execution stopped, print registers?")) {
 892       print_state64(pc, regs);
 893       BREAKPOINT;
 894       assert(false, "start up GDB");
 895     }
 896     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 897   } else {
 898     ttyLocker ttyl;
 899     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
 900                     msg);
 901     assert(false, "DEBUG MESSAGE: %s", msg);
 902   }
 903 }
 904 
 905 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 906   ttyLocker ttyl;
 907   FlagSetting fs(Debugging, true);
 908   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 909 #ifndef PRODUCT
 910   tty->cr();
 911   findpc(pc);
 912   tty->cr();
 913 #endif
 914 #define PRINT_REG(rax, value) \
 915   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 916   PRINT_REG(rax, regs[15]);
 917   PRINT_REG(rbx, regs[12]);
 918   PRINT_REG(rcx, regs[14]);
 919   PRINT_REG(rdx, regs[13]);
 920   PRINT_REG(rdi, regs[8]);
 921   PRINT_REG(rsi, regs[9]);
 922   PRINT_REG(rbp, regs[10]);
 923   PRINT_REG(rsp, regs[11]);
 924   PRINT_REG(r8 , regs[7]);
 925   PRINT_REG(r9 , regs[6]);
 926   PRINT_REG(r10, regs[5]);
 927   PRINT_REG(r11, regs[4]);
 928   PRINT_REG(r12, regs[3]);
 929   PRINT_REG(r13, regs[2]);
 930   PRINT_REG(r14, regs[1]);
 931   PRINT_REG(r15, regs[0]);
 932 #undef PRINT_REG
 933   // Print some words near top of stack.
 934   int64_t* rsp = (int64_t*) regs[11];
 935   int64_t* dump_sp = rsp;
 936   for (int col1 = 0; col1 < 8; col1++) {
 937     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 938     os::print_location(tty, *dump_sp++);
 939   }
 940   for (int row = 0; row < 25; row++) {
 941     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 942     for (int col = 0; col < 4; col++) {
 943       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 944     }
 945     tty->cr();
 946   }
 947   // Print some instructions around pc:
 948   Disassembler::decode((address)pc-64, (address)pc);
 949   tty->print_cr("--------");
 950   Disassembler::decode((address)pc, (address)pc+32);
 951 }
 952 
 953 #endif // _LP64
 954 
 955 // Now versions that are common to 32/64 bit
 956 
 957 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 958   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 959 }
 960 
 961 void MacroAssembler::addptr(Register dst, Register src) {
 962   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 963 }
 964 
 965 void MacroAssembler::addptr(Address dst, Register src) {
 966   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 967 }
 968 
 969 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 970   if (reachable(src)) {
 971     Assembler::addsd(dst, as_Address(src));
 972   } else {
 973     lea(rscratch1, src);
 974     Assembler::addsd(dst, Address(rscratch1, 0));
 975   }
 976 }
 977 
 978 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 979   if (reachable(src)) {
 980     addss(dst, as_Address(src));
 981   } else {
 982     lea(rscratch1, src);
 983     addss(dst, Address(rscratch1, 0));
 984   }
 985 }
 986 
 987 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
 988   if (reachable(src)) {
 989     Assembler::addpd(dst, as_Address(src));
 990   } else {
 991     lea(rscratch1, src);
 992     Assembler::addpd(dst, Address(rscratch1, 0));
 993   }
 994 }
 995 
 996 void MacroAssembler::align(int modulus) {
 997   align(modulus, offset());
 998 }
 999 
1000 void MacroAssembler::align(int modulus, int target) {
1001   if (target % modulus != 0) {
1002     nop(modulus - (target % modulus));
1003   }
1004 }
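
// Example: with the current code offset at 13, align(16) emits nop(3) so
// the next instruction starts at offset 16; if the offset is already a
// multiple of the modulus, nothing is emitted.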
1005 
1006 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
1007   // Used in sign-masking with aligned address.
1008   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1009   if (reachable(src)) {
1010     Assembler::andpd(dst, as_Address(src));
1011   } else {
1012     lea(rscratch1, src);
1013     Assembler::andpd(dst, Address(rscratch1, 0));
1014   }
1015 }
1016 
1017 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
1018   // Used in sign-masking with aligned address.
1019   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1020   if (reachable(src)) {
1021     Assembler::andps(dst, as_Address(src));
1022   } else {
1023     lea(rscratch1, src);
1024     Assembler::andps(dst, Address(rscratch1, 0));
1025   }
1026 }
1027 
1028 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1029   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1030 }
1031 
1032 void MacroAssembler::atomic_incl(Address counter_addr) {
1033   if (os::is_MP())
1034     lock();
1035   incrementl(counter_addr);
1036 }
1037 
1038 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1039   if (reachable(counter_addr)) {
1040     atomic_incl(as_Address(counter_addr));
1041   } else {
1042     lea(scr, counter_addr);
1043     atomic_incl(Address(scr, 0));
1044   }
1045 }
1046 
1047 #ifdef _LP64
1048 void MacroAssembler::atomic_incq(Address counter_addr) {
1049   if (os::is_MP())
1050     lock();
1051   incrementq(counter_addr);
1052 }
1053 
1054 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1055   if (reachable(counter_addr)) {
1056     atomic_incq(as_Address(counter_addr));
1057   } else {
1058     lea(scr, counter_addr);
1059     atomic_incq(Address(scr, 0));
1060   }
1061 }
1062 #endif
1063 
1064 // Writes to successive stack pages until the given offset is reached, to
1065 // check for stack overflow + shadow pages.  This clobbers tmp.
1066 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1067   movptr(tmp, rsp);
1068   // Bang stack for total size given plus shadow page size.
1069   // Bang one page at a time because large size can bang beyond yellow and
1070   // red zones.
1071   Label loop;
1072   bind(loop);
1073   movl(Address(tmp, (-os::vm_page_size())), size );
1074   subptr(tmp, os::vm_page_size());
1075   subl(size, os::vm_page_size());
1076   jcc(Assembler::greater, loop);
1077 
1078   // Bang down shadow pages too.
1079   // At this point, (tmp-0) is the last address touched, so don't
1080   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1081   // was post-decremented.)  Skip this address by starting at i=1, and
1082   // touch a few more pages below.  N.B.  It is important to touch all
1083   // the way down including all pages in the shadow zone.
1084   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1085     // this could be any sized move but this can be a debugging crumb
1086     // so the bigger the better.
1087     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1088   }
1089 }
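
// Illustrative C sketch of the banging pattern above (page and shadow stand
// in for os::vm_page_size() and the shadow zone size; the value stored is
// irrelevant, the store only forces the page to be mapped and checked):
//
//   char* p = (char*)rsp;
//   long remaining = size;
//   do {
//     p -= page;
//     *(volatile int*)p = 0;                 // touch one word per page
//     remaining -= page;
//   } while (remaining > 0);
//   for (int i = 1; i < shadow / page; i++) {
//     *(volatile int*)(p - i * page) = 0;    // and the shadow pages below
//   }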
1090 
1091 void MacroAssembler::reserved_stack_check() {
1092     // testing if reserved zone needs to be enabled
1093     Label no_reserved_zone_enabling;
1094     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1095     NOT_LP64(get_thread(rsi);)
1096 
1097     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1098     jcc(Assembler::below, no_reserved_zone_enabling);
1099 
1100     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1101     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1102     should_not_reach_here();
1103 
1104     bind(no_reserved_zone_enabling);
1105 }
1106 
1107 int MacroAssembler::biased_locking_enter(Register lock_reg,
1108                                          Register obj_reg,
1109                                          Register swap_reg,
1110                                          Register tmp_reg,
1111                                          bool swap_reg_contains_mark,
1112                                          Label& done,
1113                                          Label* slow_case,
1114                                          BiasedLockingCounters* counters) {
1115   assert(UseBiasedLocking, "why call this otherwise?");
1116   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1117   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1118   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1119   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1120   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1121   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1122 
1123   if (PrintBiasedLockingStatistics && counters == NULL) {
1124     counters = BiasedLocking::counters();
1125   }
1126   // Biased locking
1127   // See whether the lock is currently biased toward our thread and
1128   // whether the epoch is still valid
1129   // Note that the runtime guarantees sufficient alignment of JavaThread
1130   // pointers to allow age to be placed into low bits
1131   // First check to see whether biasing is even enabled for this object
1132   Label cas_label;
1133   int null_check_offset = -1;
1134   if (!swap_reg_contains_mark) {
1135     null_check_offset = offset();
1136     movptr(swap_reg, mark_addr);
1137   }
1138   movptr(tmp_reg, swap_reg);
1139   andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1140   cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1141   jcc(Assembler::notEqual, cas_label);
1142   // The bias pattern is present in the object's header. Need to check
1143   // whether the bias owner and the epoch are both still current.
1144 #ifndef _LP64
1145   // Note that because there is no current thread register on x86_32 we
1146   // need to store off the mark word we read out of the object to
1147   // avoid reloading it and needing to recheck invariants below. This
1148   // store is unfortunate but it makes the overall code shorter and
1149   // simpler.
1150   movptr(saved_mark_addr, swap_reg);
1151 #endif
1152   if (swap_reg_contains_mark) {
1153     null_check_offset = offset();
1154   }
1155   load_prototype_header(tmp_reg, obj_reg);
1156 #ifdef _LP64
1157   orptr(tmp_reg, r15_thread);
1158   xorptr(tmp_reg, swap_reg);
1159   Register header_reg = tmp_reg;
1160 #else
1161   xorptr(tmp_reg, swap_reg);
1162   get_thread(swap_reg);
1163   xorptr(swap_reg, tmp_reg);
1164   Register header_reg = swap_reg;
1165 #endif
1166   andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1167   if (counters != NULL) {
1168     cond_inc32(Assembler::zero,
1169                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1170   }
1171   jcc(Assembler::equal, done);
1172 
1173   Label try_revoke_bias;
1174   Label try_rebias;
1175 
1176   // At this point we know that the header has the bias pattern and
1177   // that we are not the bias owner in the current epoch. We need to
1178   // figure out more details about the state of the header in order to
1179   // know what operations can be legally performed on the object's
1180   // header.
1181 
1182   // If the low three bits in the xor result aren't clear, that means
1183   // the prototype header is no longer biased and we have to revoke
1184   // the bias on this object.
1185   testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1186   jccb(Assembler::notZero, try_revoke_bias);
1187 
1188   // Biasing is still enabled for this data type. See whether the
1189   // epoch of the current bias is still valid, meaning that the epoch
1190   // bits of the mark word are equal to the epoch bits of the
1191   // prototype header. (Note that the prototype header's epoch bits
1192   // only change at a safepoint.) If not, attempt to rebias the object
1193   // toward the current thread. Note that we must be absolutely sure
1194   // that the current epoch is invalid in order to do this because
1195   // otherwise the manipulations it performs on the mark word are
1196   // illegal.
1197   testptr(header_reg, markOopDesc::epoch_mask_in_place);
1198   jccb(Assembler::notZero, try_rebias);
1199 
1200   // The epoch of the current bias is still valid but we know nothing
1201   // about the owner; it might be set or it might be clear. Try to
1202   // acquire the bias of the object using an atomic operation. If this
1203   // fails we will go in to the runtime to revoke the object's bias.
1204   // Note that we first construct the presumed unbiased header so we
1205   // don't accidentally blow away another thread's valid bias.
1206   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1207   andptr(swap_reg,
1208          markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1209 #ifdef _LP64
1210   movptr(tmp_reg, swap_reg);
1211   orptr(tmp_reg, r15_thread);
1212 #else
1213   get_thread(tmp_reg);
1214   orptr(tmp_reg, swap_reg);
1215 #endif
1216   if (os::is_MP()) {
1217     lock();
1218   }
1219   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1220   // If the biasing toward our thread failed, this means that
1221   // another thread succeeded in biasing it toward itself and we
1222   // need to revoke that bias. The revocation will occur in the
1223   // interpreter runtime in the slow case.
1224   if (counters != NULL) {
1225     cond_inc32(Assembler::zero,
1226                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1227   }
1228   if (slow_case != NULL) {
1229     jcc(Assembler::notZero, *slow_case);
1230   }
1231   jmp(done);
1232 
1233   bind(try_rebias);
1234   // At this point we know the epoch has expired, meaning that the
1235   // current "bias owner", if any, is actually invalid. Under these
1236   // circumstances _only_, we are allowed to use the current header's
1237   // value as the comparison value when doing the cas to acquire the
1238   // bias in the current epoch. In other words, we allow transfer of
1239   // the bias from one thread to another directly in this situation.
1240   //
1241   // FIXME: due to a lack of registers we currently blow away the age
1242   // bits in this situation. Should attempt to preserve them.
1243   load_prototype_header(tmp_reg, obj_reg);
1244 #ifdef _LP64
1245   orptr(tmp_reg, r15_thread);
1246 #else
1247   get_thread(swap_reg);
1248   orptr(tmp_reg, swap_reg);
1249   movptr(swap_reg, saved_mark_addr);
1250 #endif
1251   if (os::is_MP()) {
1252     lock();
1253   }
1254   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1255   // If the biasing toward our thread failed, then another thread
1256   // succeeded in biasing it toward itself and we need to revoke that
1257   // bias. The revocation will occur in the runtime in the slow case.
1258   if (counters != NULL) {
1259     cond_inc32(Assembler::zero,
1260                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1261   }
1262   if (slow_case != NULL) {
1263     jcc(Assembler::notZero, *slow_case);
1264   }
1265   jmp(done);
1266 
1267   bind(try_revoke_bias);
1268   // The prototype mark in the klass doesn't have the bias bit set any
1269   // more, indicating that objects of this data type are not supposed
1270   // to be biased any more. We are going to try to reset the mark of
1271   // this object to the prototype value and fall through to the
1272   // CAS-based locking scheme. Note that if our CAS fails, it means
1273   // that another thread raced us for the privilege of revoking the
1274   // bias of this particular object, so it's okay to continue in the
1275   // normal locking code.
1276   //
1277   // FIXME: due to a lack of registers we currently blow away the age
1278   // bits in this situation. Should attempt to preserve them.
1279   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1280   load_prototype_header(tmp_reg, obj_reg);
1281   if (os::is_MP()) {
1282     lock();
1283   }
1284   cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1285   // Fall through to the normal CAS-based lock, because no matter what
1286   // the result of the above CAS, some thread must have succeeded in
1287   // removing the bias bit from the object's header.
1288   if (counters != NULL) {
1289     cond_inc32(Assembler::zero,
1290                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1291   }
1292 
1293   bind(cas_label);
1294 
1295   return null_check_offset;
1296 }
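
// Illustrative sketch of the fast-path decisions made above, in C-like
// terms (the mask/pattern names abbreviate the markOopDesc constants):
//
//   intptr_t mark = obj->mark();
//   if ((mark & biased_lock_mask) != biased_lock_pattern) goto cas_label;
//   intptr_t diff = (prototype_header | thread) ^ mark;
//   if ((diff & ~age_mask) == 0)  goto done;            // biased to us, epoch valid
//   if (diff & biased_lock_mask)  goto try_revoke_bias;  // biasing disabled for klass
//   if (diff & epoch_mask)        goto try_rebias;       // epoch expired
//   /* otherwise */                                      // CAS an anonymous bias to our thread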
1297 
1298 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1299   assert(UseBiasedLocking, "why call this otherwise?");
1300 
1301   // Check for biased locking unlock case, which is a no-op
1302   // Note: we do not have to check the thread ID for two reasons.
1303   // First, the interpreter checks for IllegalMonitorStateException at
1304   // a higher level. Second, if the bias was revoked while we held the
1305   // lock, the object could not be rebiased toward another thread, so
1306   // the bias bit would be clear.
1307   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1308   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1309   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1310   jcc(Assembler::equal, done);
1311 }
1312 
1313 #ifdef COMPILER2
1314 
1315 #if INCLUDE_RTM_OPT
1316 
1317 // Update rtm_counters based on abort status
1318 // input: abort_status
1319 //        rtm_counters (RTMLockingCounters*)
1320 // flags are killed
1321 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1322 
1323   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1324   if (PrintPreciseRTMLockingStatistics) {
1325     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1326       Label check_abort;
1327       testl(abort_status, (1<<i));
1328       jccb(Assembler::equal, check_abort);
1329       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1330       bind(check_abort);
1331     }
1332   }
1333 }
1334 
1335 // Branch if (random & (count-1) != 0), count is 2^n
1336 // tmp, scr and flags are killed
1337 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1338   assert(tmp == rax, "");
1339   assert(scr == rdx, "");
1340   rdtsc(); // modifies EDX:EAX
1341   andptr(tmp, count-1);
1342   jccb(Assembler::notZero, brLabel);
1343 }
1344 
1345 // Perform abort ratio calculation, set no_rtm bit if high ratio
1346 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1347 // tmpReg, rtm_counters_Reg and flags are killed
1348 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1349                                                  Register rtm_counters_Reg,
1350                                                  RTMLockingCounters* rtm_counters,
1351                                                  Metadata* method_data) {
1352   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1353 
1354   if (RTMLockingCalculationDelay > 0) {
1355     // Delay calculation
1356     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1357     testptr(tmpReg, tmpReg);
1358     jccb(Assembler::equal, L_done);
1359   }
1360   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1361   //   Aborted transactions = abort_count * 100
1362   //   All transactions = total_count *  RTMTotalCountIncrRate
1363   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
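  //
  //   For example (assuming abort_count has already reached RTMAbortThreshold),
  //   with RTMTotalCountIncrRate == 64 and RTMAbortRatio == 50:
  //   abort_count == 1000 and total_count == 30 give
  //   Aborted = 1000 * 100 = 100000 and All = 30 * 64 = 1920;
  //   since 100000 >= 1920 * 50 == 96000, the no_rtm bit is set.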
1364 
1365   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1366   cmpptr(tmpReg, RTMAbortThreshold);
1367   jccb(Assembler::below, L_check_always_rtm2);
1368   imulptr(tmpReg, tmpReg, 100);
1369 
1370   Register scrReg = rtm_counters_Reg;
1371   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1372   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1373   imulptr(scrReg, scrReg, RTMAbortRatio);
1374   cmpptr(tmpReg, scrReg);
1375   jccb(Assembler::below, L_check_always_rtm1);
1376   if (method_data != NULL) {
1377     // set rtm_state to "no rtm" in MDO
1378     mov_metadata(tmpReg, method_data);
1379     if (os::is_MP()) {
1380       lock();
1381     }
1382     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1383   }
1384   jmpb(L_done);
1385   bind(L_check_always_rtm1);
1386   // Reload RTMLockingCounters* address
1387   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1388   bind(L_check_always_rtm2);
1389   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1390   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1391   jccb(Assembler::below, L_done);
1392   if (method_data != NULL) {
1393     // set rtm_state to "always rtm" in MDO
1394     mov_metadata(tmpReg, method_data);
1395     if (os::is_MP()) {
1396       lock();
1397     }
1398     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1399   }
1400   bind(L_done);
1401 }
1402 
1403 // Update counters and perform abort ratio calculation
1404 // input:  abort_status_Reg
1405 // rtm_counters_Reg, flags are killed
1406 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1407                                    Register rtm_counters_Reg,
1408                                    RTMLockingCounters* rtm_counters,
1409                                    Metadata* method_data,
1410                                    bool profile_rtm) {
1411 
1412   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1413   // update rtm counters based on rax value at abort
1414   // reads abort_status_Reg, updates flags
1415   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1416   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1417   if (profile_rtm) {
1418     // Save abort status because abort_status_Reg is used by following code.
1419     if (RTMRetryCount > 0) {
1420       push(abort_status_Reg);
1421     }
1422     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1423     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1424     // restore abort status
1425     if (RTMRetryCount > 0) {
1426       pop(abort_status_Reg);
1427     }
1428   }
1429 }
1430 
1431 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1432 // inputs: retry_count_Reg
1433 //       : abort_status_Reg
1434 // output: retry_count_Reg decremented by 1
1435 // flags are killed
1436 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1437   Label doneRetry;
1438   assert(abort_status_Reg == rax, "");
1439   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1440   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1441   // if reason is in 0x6 and retry count != 0 then retry
1442   andptr(abort_status_Reg, 0x6);
1443   jccb(Assembler::zero, doneRetry);
1444   testl(retry_count_Reg, retry_count_Reg);
1445   jccb(Assembler::zero, doneRetry);
1446   pause();
1447   decrementl(retry_count_Reg);
1448   jmp(retryLabel);
1449   bind(doneRetry);
1450 }
1451 
1452 // Spin and retry if lock is busy,
1453 // inputs: box_Reg (monitor address)
1454 //       : retry_count_Reg
1455 // output: retry_count_Reg decremented by 1
1456 //       : clear z flag if retry count exceeded
1457 // tmp_Reg, scr_Reg, flags are killed
1458 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1459                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1460   Label SpinLoop, SpinExit, doneRetry;
1461   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1462 
1463   testl(retry_count_Reg, retry_count_Reg);
1464   jccb(Assembler::zero, doneRetry);
1465   decrementl(retry_count_Reg);
1466   movptr(scr_Reg, RTMSpinLoopCount);
1467 
1468   bind(SpinLoop);
1469   pause();
1470   decrementl(scr_Reg);
1471   jccb(Assembler::lessEqual, SpinExit);
1472   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1473   testptr(tmp_Reg, tmp_Reg);
1474   jccb(Assembler::notZero, SpinLoop);
1475 
1476   bind(SpinExit);
1477   jmp(retryLabel);
1478   bind(doneRetry);
1479   incrementl(retry_count_Reg); // clear z flag
1480 }
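     // Roughly, in C-like pseudo-code (illustrative sketch, not additional emitted code):
     //   if (retry_count == 0) { retry_count++; /* clears ZF */ return; }  // give up
     //   --retry_count;
     //   for (int spins = RTMSpinLoopCount; ; ) {
     //     pause();
     //     if (--spins <= 0) break;
     //     if (monitor->owner == NULL) break;
     //   }
     //   goto retryLabel;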
1481 
1482 // Use RTM for normal stack locks
1483 // Input: objReg (object to lock)
1484 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1485                                        Register retry_on_abort_count_Reg,
1486                                        RTMLockingCounters* stack_rtm_counters,
1487                                        Metadata* method_data, bool profile_rtm,
1488                                        Label& DONE_LABEL, Label& IsInflated) {
1489   assert(UseRTMForStackLocks, "why call this otherwise?");
1490   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1491   assert(tmpReg == rax, "");
1492   assert(scrReg == rdx, "");
1493   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1494 
1495   if (RTMRetryCount > 0) {
1496     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1497     bind(L_rtm_retry);
1498   }
1499   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1500   testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1501   jcc(Assembler::notZero, IsInflated);
1502 
1503   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1504     Label L_noincrement;
1505     if (RTMTotalCountIncrRate > 1) {
1506       // tmpReg, scrReg and flags are killed
1507       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1508     }
1509     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1510     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1511     bind(L_noincrement);
1512   }
1513   xbegin(L_on_abort);
1514   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
1515   andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1516   cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1517   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1518 
1519   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1520   if (UseRTMXendForLockBusy) {
1521     xend();
1522     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1523     jmp(L_decrement_retry);
1524   }
1525   else {
1526     xabort(0);
1527   }
1528   bind(L_on_abort);
1529   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1530     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1531   }
1532   bind(L_decrement_retry);
1533   if (RTMRetryCount > 0) {
1534     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1535     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1536   }
1537 }
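     // A rough sketch of the transactional fast path emitted above (illustrative only):
     //   retry:
     //     if (obj->mark() is a monitor)  goto IsInflated;
     //     xbegin(on_abort);
     //     if ((obj->mark() & lock_bits) == unlocked_value)  goto DONE;  // lock elided in the txn
     //     // lock is busy: xend (abort status 0x2, retryable) or xabort(0)
     //   on_abort:
     //     update total/abort counters and abort-ratio profiling if requested;
     //     if (abort is retryable && retries remain)  goto retry;
     //     // otherwise fall through; the caller continues with conventional stack-locking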
1538 
1539 // Use RTM for inflated locks
1540 // inputs: objReg (object to lock)
1541 //         boxReg (on-stack box address (displaced header location) - KILLED)
1542 //         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1543 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1544                                           Register scrReg, Register retry_on_busy_count_Reg,
1545                                           Register retry_on_abort_count_Reg,
1546                                           RTMLockingCounters* rtm_counters,
1547                                           Metadata* method_data, bool profile_rtm,
1548                                           Label& DONE_LABEL) {
1549   assert(UseRTMLocking, "why call this otherwise?");
1550   assert(tmpReg == rax, "");
1551   assert(scrReg == rdx, "");
1552   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1553   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1554 
1555   // Without a cast to int32_t, movptr will destroy r10 which is typically obj
1556   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1557   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1558 
1559   if (RTMRetryCount > 0) {
1560     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1561     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1562     bind(L_rtm_retry);
1563   }
1564   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1565     Label L_noincrement;
1566     if (RTMTotalCountIncrRate > 1) {
1567       // tmpReg, scrReg and flags are killed
1568       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1569     }
1570     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1571     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1572     bind(L_noincrement);
1573   }
1574   xbegin(L_on_abort);
1575   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1576   movptr(tmpReg, Address(tmpReg, owner_offset));
1577   testptr(tmpReg, tmpReg);
1578   jcc(Assembler::zero, DONE_LABEL);
1579   if (UseRTMXendForLockBusy) {
1580     xend();
1581     jmp(L_decrement_retry);
1582   }
1583   else {
1584     xabort(0);
1585   }
1586   bind(L_on_abort);
1587   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1588   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1589     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1590   }
1591   if (RTMRetryCount > 0) {
1592     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1593     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1594   }
1595 
1596   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1597   testptr(tmpReg, tmpReg) ;
1598   jccb(Assembler::notZero, L_decrement_retry) ;
1599 
1600   // Appears unlocked - try to swing _owner from null to non-null.
1601   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1602 #ifdef _LP64
1603   Register threadReg = r15_thread;
1604 #else
1605   get_thread(scrReg);
1606   Register threadReg = scrReg;
1607 #endif
1608   if (os::is_MP()) {
1609     lock();
1610   }
1611   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1612 
1613   if (RTMRetryCount > 0) {
1614     // success done else retry
1615     jccb(Assembler::equal, DONE_LABEL) ;
1616     bind(L_decrement_retry);
1617     // Spin and retry if lock is busy.
1618     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1619   }
1620   else {
1621     bind(L_decrement_retry);
1622   }
1623 }
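     // A rough sketch of the path emitted above (illustrative only):
     //   retry:
     //     xbegin(on_abort);
     //     if (monitor->owner == NULL)  goto DONE;     // lock elided inside the transaction
     //     // lock is busy: xend (so the abort is retryable) or xabort(0)
     //   on_abort:
     //     profile aborts; retry on abort if permitted;
     //   fallback (no transaction):
     //     if (monitor->owner == NULL && CAS(&monitor->owner, NULL, Self) succeeds)  goto DONE;
     //     spin while owner != NULL (bounded by RTMSpinLoopCount) and retry up to RTMRetryCount,
     //     otherwise fall through with ZF == 0 so the caller takes the slow path.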
1624 
1625 #endif //  INCLUDE_RTM_OPT
1626 
1627 // Fast_Lock and Fast_Unlock used by C2
1628 
1629 // Because the transitions from emitted code to the runtime
1630 // monitorenter/exit helper stubs are so slow it's critical that
1631 // we inline both the stack-locking fast-path and the inflated fast path.
1632 //
1633 // See also: cmpFastLock and cmpFastUnlock.
1634 //
1635 // What follows is a specialized inline transliteration of the code
1636 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1637 // another option would be to emit TrySlowEnter and TrySlowExit methods
1638 // at startup-time.  These methods would accept arguments as
1639 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1640 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1641 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1642 // In practice, however, the # of lock sites is bounded and is usually small.
1643 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1644 // if the processor uses simple bimodal branch predictors keyed by EIP,
1645 // since the helper routines would be called from multiple synchronization
1646 // sites.
1647 //
1648 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1649 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1650 // to those specialized methods.  That'd give us a mostly platform-independent
1651 // implementation that the JITs could optimize and inline at their pleasure.
1652 // Done correctly, the only time we'd need to cross to native code would be
1653 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1654 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1655 // (b) explicit barriers or fence operations.
1656 //
1657 // TODO:
1658 //
1659 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1660 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1661 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1662 //    the lock operators would typically be faster than reifying Self.
1663 //
1664 // *  Ideally I'd define the primitives as:
1665 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1666 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1667 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1668 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
1669 //    Furthermore the register assignments are overconstrained, possibly resulting in
1670 //    sub-optimal code near the synchronization site.
1671 //
1672 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1673 //    Alternately, use a better sp-proximity test.
1674 //
1675 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1676 //    Either one is sufficient to uniquely identify a thread.
1677 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1678 //
1679 // *  Intrinsify notify() and notifyAll() for the common cases where the
1680 //    object is locked by the calling thread but the waitlist is empty,
1681 //    avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1682 //
1683 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1684 //    But beware of excessive branch density on AMD Opterons.
1685 //
1686 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1687 //    or failure of the fast-path.  If the fast-path fails then we pass
1688 //    control to the slow-path, typically in C.  In Fast_Lock and
1689 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1690 //    will emit a conditional branch immediately after the node.
1691 //    So we have branches to branches and lots of ICC.ZF games.
1692 //    Instead, it might be better to have C2 pass a "FailureLabel"
1693 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1694 //    will drop through the node.  ICC.ZF is undefined at exit.
1695 //    In the case of failure, the node will branch directly to the
1696 //    FailureLabel.
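     //
     //    As a rough sketch (illustrative only; slow_path_* are hypothetical labels),
     //    the contract described above is:
     //      Fast_Lock(obj, box, tmp, scr)     // sets ZF == 1 on success, ZF == 0 on failure
     //      jne   slow_path_monitorenter      // branch emitted by C2 right after the node
     //      ...
     //      Fast_Unlock(obj, box, tmp)        // same ZF protocol
     //      jne   slow_path_monitorexit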
1697 
1698 
1699 // obj: object to lock
1700 // box: on-stack box address (displaced header location) - KILLED
1701 // rax,: tmp -- KILLED
1702 // scr: tmp -- KILLED
1703 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1704                                Register scrReg, Register cx1Reg, Register cx2Reg,
1705                                BiasedLockingCounters* counters,
1706                                RTMLockingCounters* rtm_counters,
1707                                RTMLockingCounters* stack_rtm_counters,
1708                                Metadata* method_data,
1709                                bool use_rtm, bool profile_rtm) {
1710   // Ensure the register assignments are disjoint
1711   assert(tmpReg == rax, "");
1712 
1713   if (use_rtm) {
1714     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1715   } else {
1716     assert(cx1Reg == noreg, "");
1717     assert(cx2Reg == noreg, "");
1718     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1719   }
1720 
1721   if (counters != NULL) {
1722     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1723   }
1724 
1725   // Possible cases that we'll encounter in fast_lock
1726   // ------------------------------------------------
1727   // * Inflated
1728   //    -- unlocked
1729   //    -- Locked
1730   //       = by self
1731   //       = by other
1732   // * biased
1733   //    -- by Self
1734   //    -- by other
1735   // * neutral
1736   // * stack-locked
1737   //    -- by self
1738   //       = sp-proximity test hits
1739   //       = sp-proximity test generates false-negative
1740   //    -- by other
1741   //
1742 
1743   Label IsInflated, DONE_LABEL;
1744 
1745   // it's stack-locked, biased or neutral
1746   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1747   // order to reduce the number of conditional branches in the most common cases.
1748   // Beware -- there's a subtle invariant that fetch of the markword
1749   // at [FETCH], below, will never observe a biased encoding (*101b).
1750   // If this invariant is not held we risk exclusion (safety) failure.
1751   if (UseBiasedLocking && !UseOptoBiasInlining) {
1752     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1753   }
1754 
1755 #if INCLUDE_RTM_OPT
1756   if (UseRTMForStackLocks && use_rtm) {
1757     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1758                       stack_rtm_counters, method_data, profile_rtm,
1759                       DONE_LABEL, IsInflated);
1760   }
1761 #endif // INCLUDE_RTM_OPT
1762 
1763   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
1764   testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1765   jccb(Assembler::notZero, IsInflated);
1766 
1767   // Attempt stack-locking ...
1768   orptr (tmpReg, markOopDesc::unlocked_value);
1769   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1770   if (os::is_MP()) {
1771     lock();
1772   }
1773   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
1774   if (counters != NULL) {
1775     cond_inc32(Assembler::equal,
1776                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1777   }
1778   jcc(Assembler::equal, DONE_LABEL);           // Success
1779 
1780   // Recursive locking.
1781   // The object is stack-locked: markword contains stack pointer to BasicLock.
1782   // Locked by current thread if difference with current SP is less than one page.
1783   subptr(tmpReg, rsp);
1784   // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1785   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
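       // For example, assuming a 4 KiB page on LP64 the immediate is 7 - 4096 = 0x...F007,
       // so the AND result (and hence ZFlag) is zero only when 0 <= mark - rsp < 4096 and the
       // low three bits are clear, i.e. the BasicLock sits within one page above our SP and
       // is therefore a recursive stack-lock owned by this thread.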
1786   movptr(Address(boxReg, 0), tmpReg);
1787   if (counters != NULL) {
1788     cond_inc32(Assembler::equal,
1789                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1790   }
1791   jmp(DONE_LABEL);
1792 
1793   bind(IsInflated);
1794   // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1795 
1796 #if INCLUDE_RTM_OPT
1797   // Use the same RTM locking code in 32- and 64-bit VM.
1798   if (use_rtm) {
1799     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1800                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1801   } else {
1802 #endif // INCLUDE_RTM_OPT
1803 
1804 #ifndef _LP64
1805   // The object is inflated.
1806 
1807   // boxReg refers to the on-stack BasicLock in the current frame.
1808   // We'd like to write:
1809   //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1810   // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
1811   // additional latency because we have another ST in the store buffer that must drain.
1812 
1813   // avoid ST-before-CAS
1814   // register juggle because we need tmpReg for cmpxchgptr below
1815   movptr(scrReg, boxReg);
1816   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1817 
1818   // Optimistic form: consider XORL tmpReg,tmpReg
1819   movptr(tmpReg, NULL_WORD);
1820 
1821   // Appears unlocked - try to swing _owner from null to non-null.
1822   // Ideally, I'd manifest "Self" with get_thread and then attempt
1823   // to CAS the register containing Self into m->Owner.
1824   // But we don't have enough registers, so instead we can either try to CAS
1825   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1826   // we later store "Self" into m->Owner.  Transiently storing a stack address
1827   // (rsp or the address of the box) into  m->owner is harmless.
1828   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1829   if (os::is_MP()) {
1830     lock();
1831   }
1832   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1833   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1834   // If we weren't able to swing _owner from NULL to the BasicLock
1835   // then take the slow path.
1836   jccb  (Assembler::notZero, DONE_LABEL);
1837   // update _owner from BasicLock to thread
1838   get_thread (scrReg);                    // beware: clobbers ICCs
1839   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1840   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1841 
1842   // If the CAS fails we can either retry or pass control to the slow-path.
1843   // We use the latter tactic.
1844   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1845   // If the CAS was successful ...
1846   //   Self has acquired the lock
1847   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1848   // Intentional fall-through into DONE_LABEL ...
1849 #else // _LP64
1850   // It's inflated
1851   movq(scrReg, tmpReg);
1852   xorq(tmpReg, tmpReg);
1853 
1854   if (os::is_MP()) {
1855     lock();
1856   }
1857   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1858   // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1859   // Without a cast to int32_t, movptr will destroy r10 which is typically obj.
1860   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1861   // Intentional fall-through into DONE_LABEL ...
1862   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1863 #endif // _LP64
1864 #if INCLUDE_RTM_OPT
1865   } // use_rtm()
1866 #endif
1867   // DONE_LABEL is a hot target - we'd really like to place it at the
1868   // start of cache line by padding with NOPs.
1869   // See the AMD and Intel software optimization manuals for the
1870   // most efficient "long" NOP encodings.
1871   // Unfortunately none of our alignment mechanisms suffice.
1872   bind(DONE_LABEL);
1873 
1874   // At DONE_LABEL the icc ZFlag is set as follows ...
1875   // Fast_Unlock uses the same protocol.
1876   // ZFlag == 1 -> Success
1877   // ZFlag == 0 -> Failure - force control through the slow-path
1878 }
1879 
1880 // obj: object to unlock
1881 // box: box address (displaced header location), killed.  Must be EAX.
1882 // tmp: killed, cannot be obj nor box.
1883 //
1884 // Some commentary on balanced locking:
1885 //
1886 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1887 // Methods that don't have provably balanced locking are forced to run in the
1888 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1889 // The interpreter provides two properties:
1890 // I1:  At return-time the interpreter automatically and quietly unlocks any
1891 //      objects acquired by the current activation (frame).  Recall that the
1892 //      interpreter maintains an on-stack list of locks currently held by
1893 //      a frame.
1894 // I2:  If a method attempts to unlock an object that is not held by
1895 //      the frame, the interpreter throws IMSX.
1896 //
1897 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1898 // B() doesn't have provably balanced locking so it runs in the interpreter.
1899 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1900 // is still locked by A().
1901 //
1902 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1903 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1904 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1905 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1906 // Arguably given that the spec legislates the JNI case as undefined our implementation
1907 // could reasonably *avoid* checking owner in Fast_Unlock().
1908 // In the interest of performance we elide the m->Owner==Self check in unlock.
1909 // A perfectly viable alternative is to elide the owner check except when
1910 // Xcheck:jni is enabled.
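     // A rough C-like sketch of the inflated-unlock fast path below (64-bit flavour,
     // illustrative only):
     //   if (m->_recursions != 0)                  goto slow;                        // ZF == 0
     //   if ((m->_cxq | m->_EntryList) == 0)       { m->_owner = NULL; goto done; }  // 1-0 exit
     //   if (m->_succ == NULL)                     goto slow;
     //   m->_owner = NULL;  full_fence();          // ST _owner; MEMBAR; LD _succ
     //   if (m->_succ != NULL)                     goto done;
     //   if (CAS(&m->_owner, NULL, Self) != NULL)  goto done;   // another thread took the lock
     //   goto slow;                                // we re-acquired it; pass to the slow path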
1911 
1912 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1913   assert(boxReg == rax, "");
1914   assert_different_registers(objReg, boxReg, tmpReg);
1915 
1916   Label DONE_LABEL, Stacked, CheckSucc;
1917 
1918   // Critically, the biased locking test must have precedence over
1919   // and appear before the (box->dhw == 0) recursive stack-lock test.
1920   if (UseBiasedLocking && !UseOptoBiasInlining) {
1921     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1922   }
1923 
1924 #if INCLUDE_RTM_OPT
1925   if (UseRTMForStackLocks && use_rtm) {
1926     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1927     Label L_regular_unlock;
1928     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));           // fetch markword
1929     andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1930     cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1931     jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1932     xend();                                       // otherwise end...
1933     jmp(DONE_LABEL);                              // ... and we're done
1934     bind(L_regular_unlock);
1935   }
1936 #endif
1937 
1938   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1939   jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1940   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));             // Examine the object's markword
1941   testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1942   jccb  (Assembler::zero, Stacked);
1943 
1944   // It's inflated.
1945 #if INCLUDE_RTM_OPT
1946   if (use_rtm) {
1947     Label L_regular_inflated_unlock;
1948     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1949     movptr(boxReg, Address(tmpReg, owner_offset));
1950     testptr(boxReg, boxReg);
1951     jccb(Assembler::notZero, L_regular_inflated_unlock);
1952     xend();
1953     jmpb(DONE_LABEL);
1954     bind(L_regular_inflated_unlock);
1955   }
1956 #endif
1957 
1958   // Despite our balanced locking property we still check that m->_owner == Self
1959   // as java routines or native JNI code called by this thread might
1960   // have released the lock.
1961   // Refer to the comments in synchronizer.cpp for how we might encode extra
1962   // state in _succ so we can avoid fetching EntryList|cxq.
1963   //
1964   // I'd like to add more cases in fast_lock() and fast_unlock() --
1965   // such as recursive enter and exit -- but we have to be wary of
1966   // I$ bloat, T$ effects and BP$ effects.
1967   //
1968   // If there's no contention try a 1-0 exit.  That is, exit without
1969   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1970   // we detect and recover from the race that the 1-0 exit admits.
1971   //
1972   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1973   // before it STs null into _owner, releasing the lock.  Updates
1974   // to data protected by the critical section must be visible before
1975   // we drop the lock (and thus before any other thread could acquire
1976   // the lock and observe the fields protected by the lock).
1977   // IA32's memory-model is SPO, so STs are ordered with respect to
1978   // each other and there's no need for an explicit barrier (fence).
1979   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1980 #ifndef _LP64
1981   get_thread (boxReg);
1982 
1983   // Note that we could employ various encoding schemes to reduce
1984   // the number of loads below (currently 4) to just 2 or 3.
1985   // Refer to the comments in synchronizer.cpp.
1986   // In practice the chain of fetches doesn't seem to impact performance, however.
1987   xorptr(boxReg, boxReg);
1988   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1989   jccb  (Assembler::notZero, DONE_LABEL);
1990   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1991   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1992   jccb  (Assembler::notZero, CheckSucc);
1993   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1994   jmpb  (DONE_LABEL);
1995 
1996   bind (Stacked);
1997   // It's not inflated and it's not recursively stack-locked and it's not biased.
1998   // It must be stack-locked.
1999   // Try to reset the header to displaced header.
2000   // The "box" value on the stack is stable, so we can reload
2001   // and be assured we observe the same value as above.
2002   movptr(tmpReg, Address(boxReg, 0));
2003   if (os::is_MP()) {
2004     lock();
2005   }
2006   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2007   // Intentional fall-through into DONE_LABEL
2008 
2009   // DONE_LABEL is a hot target - we'd really like to place it at the
2010   // start of cache line by padding with NOPs.
2011   // See the AMD and Intel software optimization manuals for the
2012   // most efficient "long" NOP encodings.
2013   // Unfortunately none of our alignment mechanisms suffice.
2014   bind (CheckSucc);
2015 #else // _LP64
2016   // It's inflated
2017   xorptr(boxReg, boxReg);
2018   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2019   jccb  (Assembler::notZero, DONE_LABEL);
2020   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2021   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2022   jccb  (Assembler::notZero, CheckSucc);
2023   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2024   jmpb  (DONE_LABEL);
2025 
2026   // Try to avoid passing control into the slow_path ...
2027   Label LSuccess, LGoSlowPath ;
2028   bind  (CheckSucc);
2029 
2030   // The following optional optimization can be elided if necessary
2031   // Effectively: if (succ == null) goto SlowPath
2032   // The code reduces the window for a race, however,
2033   // and thus benefits performance.
2034   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2035   jccb  (Assembler::zero, LGoSlowPath);
2036 
2037   xorptr(boxReg, boxReg);
2038   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2039   if (os::is_MP()) {
2040     // Memory barrier/fence
2041     // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2042     // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2043     // This is faster on Nehalem and AMD Shanghai/Barcelona.
2044     // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2045     // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2046     // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2047     lock(); addl(Address(rsp, 0), 0);
2048   }
2049   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2050   jccb  (Assembler::notZero, LSuccess);
2051 
2052   // Rare inopportune interleaving - race.
2053   // The successor vanished in the small window above.
2054   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2055   // We need to ensure progress and succession.
2056   // Try to reacquire the lock.
2057   // If that fails then the new owner is responsible for succession and this
2058   // thread needs to take no further action and can exit via the fast path (success).
2059   // If the re-acquire succeeds then pass control into the slow path.
2060   // As implemented, this latter mode is horrible because we generate more
2061   // coherence traffic on the lock *and* artificially extend the critical section
2062   // length by virtue of passing control into the slow path.
2063 
2064   // box is really RAX -- the following CMPXCHG depends on that binding
2065   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2066   if (os::is_MP()) { lock(); }
2067   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2068   // There's no successor so we tried to regrab the lock.
2069   // If that didn't work, then another thread grabbed the
2070   // lock so we're done (and exit was a success).
2071   jccb  (Assembler::notEqual, LSuccess);
2072   // Intentional fall-through into slow-path
2073 
2074   bind  (LGoSlowPath);
2075   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2076   jmpb  (DONE_LABEL);
2077 
2078   bind  (LSuccess);
2079   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2080   jmpb  (DONE_LABEL);
2081 
2082   bind  (Stacked);
2083   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2084   if (os::is_MP()) { lock(); }
2085   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2086 
2087 #endif
2088   bind(DONE_LABEL);
2089 }
2090 #endif // COMPILER2
2091 
2092 void MacroAssembler::c2bool(Register x) {
2093   // implements x == 0 ? 0 : 1
2094   // note: must only look at least-significant byte of x
2095   //       since C-style booleans are stored in one byte
2096   //       only! (was bug)
2097   andl(x, 0xFF);
2098   setb(Assembler::notZero, x);
2099 }
2100 
2101 // Wouldn't need if AddressLiteral version had new name
2102 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2103   Assembler::call(L, rtype);
2104 }
2105 
2106 void MacroAssembler::call(Register entry) {
2107   Assembler::call(entry);
2108 }
2109 
2110 void MacroAssembler::call(AddressLiteral entry) {
2111   if (reachable(entry)) {
2112     Assembler::call_literal(entry.target(), entry.rspec());
2113   } else {
2114     lea(rscratch1, entry);
2115     Assembler::call(rscratch1);
2116   }
2117 }
2118 
2119 void MacroAssembler::ic_call(address entry, jint method_index) {
2120   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2121   movptr(rax, (intptr_t)Universe::non_oop_word());
2122   call(AddressLiteral(entry, rh));
2123 }
2124 
2125 // Implementation of call_VM versions
2126 
2127 void MacroAssembler::call_VM(Register oop_result,
2128                              address entry_point,
2129                              bool check_exceptions) {
2130   Label C, E;
2131   call(C, relocInfo::none);
2132   jmp(E);
2133 
2134   bind(C);
2135   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2136   ret(0);
2137 
2138   bind(E);
2139 }
2140 
2141 void MacroAssembler::call_VM(Register oop_result,
2142                              address entry_point,
2143                              Register arg_1,
2144                              bool check_exceptions) {
2145   Label C, E;
2146   call(C, relocInfo::none);
2147   jmp(E);
2148 
2149   bind(C);
2150   pass_arg1(this, arg_1);
2151   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2152   ret(0);
2153 
2154   bind(E);
2155 }
2156 
2157 void MacroAssembler::call_VM(Register oop_result,
2158                              address entry_point,
2159                              Register arg_1,
2160                              Register arg_2,
2161                              bool check_exceptions) {
2162   Label C, E;
2163   call(C, relocInfo::none);
2164   jmp(E);
2165 
2166   bind(C);
2167 
2168   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2169 
2170   pass_arg2(this, arg_2);
2171   pass_arg1(this, arg_1);
2172   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2173   ret(0);
2174 
2175   bind(E);
2176 }
2177 
2178 void MacroAssembler::call_VM(Register oop_result,
2179                              address entry_point,
2180                              Register arg_1,
2181                              Register arg_2,
2182                              Register arg_3,
2183                              bool check_exceptions) {
2184   Label C, E;
2185   call(C, relocInfo::none);
2186   jmp(E);
2187 
2188   bind(C);
2189 
2190   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2191   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2192   pass_arg3(this, arg_3);
2193 
2194   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2195   pass_arg2(this, arg_2);
2196 
2197   pass_arg1(this, arg_1);
2198   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2199   ret(0);
2200 
2201   bind(E);
2202 }
2203 
2204 void MacroAssembler::call_VM(Register oop_result,
2205                              Register last_java_sp,
2206                              address entry_point,
2207                              int number_of_arguments,
2208                              bool check_exceptions) {
2209   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2210   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2211 }
2212 
2213 void MacroAssembler::call_VM(Register oop_result,
2214                              Register last_java_sp,
2215                              address entry_point,
2216                              Register arg_1,
2217                              bool check_exceptions) {
2218   pass_arg1(this, arg_1);
2219   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2220 }
2221 
2222 void MacroAssembler::call_VM(Register oop_result,
2223                              Register last_java_sp,
2224                              address entry_point,
2225                              Register arg_1,
2226                              Register arg_2,
2227                              bool check_exceptions) {
2228 
2229   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2230   pass_arg2(this, arg_2);
2231   pass_arg1(this, arg_1);
2232   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2233 }
2234 
2235 void MacroAssembler::call_VM(Register oop_result,
2236                              Register last_java_sp,
2237                              address entry_point,
2238                              Register arg_1,
2239                              Register arg_2,
2240                              Register arg_3,
2241                              bool check_exceptions) {
2242   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2243   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2244   pass_arg3(this, arg_3);
2245   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2246   pass_arg2(this, arg_2);
2247   pass_arg1(this, arg_1);
2248   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2249 }
2250 
2251 void MacroAssembler::super_call_VM(Register oop_result,
2252                                    Register last_java_sp,
2253                                    address entry_point,
2254                                    int number_of_arguments,
2255                                    bool check_exceptions) {
2256   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2257   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2258 }
2259 
2260 void MacroAssembler::super_call_VM(Register oop_result,
2261                                    Register last_java_sp,
2262                                    address entry_point,
2263                                    Register arg_1,
2264                                    bool check_exceptions) {
2265   pass_arg1(this, arg_1);
2266   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2267 }
2268 
2269 void MacroAssembler::super_call_VM(Register oop_result,
2270                                    Register last_java_sp,
2271                                    address entry_point,
2272                                    Register arg_1,
2273                                    Register arg_2,
2274                                    bool check_exceptions) {
2275 
2276   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2277   pass_arg2(this, arg_2);
2278   pass_arg1(this, arg_1);
2279   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2280 }
2281 
2282 void MacroAssembler::super_call_VM(Register oop_result,
2283                                    Register last_java_sp,
2284                                    address entry_point,
2285                                    Register arg_1,
2286                                    Register arg_2,
2287                                    Register arg_3,
2288                                    bool check_exceptions) {
2289   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2290   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2291   pass_arg3(this, arg_3);
2292   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2293   pass_arg2(this, arg_2);
2294   pass_arg1(this, arg_1);
2295   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2296 }
2297 
2298 void MacroAssembler::call_VM_base(Register oop_result,
2299                                   Register java_thread,
2300                                   Register last_java_sp,
2301                                   address  entry_point,
2302                                   int      number_of_arguments,
2303                                   bool     check_exceptions) {
2304   // determine java_thread register
2305   if (!java_thread->is_valid()) {
2306 #ifdef _LP64
2307     java_thread = r15_thread;
2308 #else
2309     java_thread = rdi;
2310     get_thread(java_thread);
2311 #endif // LP64
2312   }
2313   // determine last_java_sp register
2314   if (!last_java_sp->is_valid()) {
2315     last_java_sp = rsp;
2316   }
2317   // debugging support
2318   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2319   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2320 #ifdef ASSERT
2321   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2322   // r12 is the heapbase.
2323   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2324 #endif // ASSERT
2325 
2326   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2327   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2328 
2329   // push java thread (becomes first argument of C function)
2330 
2331   NOT_LP64(push(java_thread); number_of_arguments++);
2332   LP64_ONLY(mov(c_rarg0, r15_thread));
2333 
2334   // set last Java frame before call
2335   assert(last_java_sp != rbp, "can't use ebp/rbp");
2336 
2337   // Only interpreter should have to set fp
2338   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2339 
2340   // do the call, remove parameters
2341   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2342 
2343   // restore the thread (cannot use the pushed argument since arguments
2344   // may be overwritten by C code generated by an optimizing compiler);
2345   // however we can use the register value directly if it is callee saved.
2346   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2347     // rdi & rsi (also r15) are callee saved -> nothing to do
2348 #ifdef ASSERT
2349     guarantee(java_thread != rax, "change this code");
2350     push(rax);
2351     { Label L;
2352       get_thread(rax);
2353       cmpptr(java_thread, rax);
2354       jcc(Assembler::equal, L);
2355       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2356       bind(L);
2357     }
2358     pop(rax);
2359 #endif
2360   } else {
2361     get_thread(java_thread);
2362   }
2363   // reset last Java frame
2364   // Only interpreter should have to clear fp
2365   reset_last_Java_frame(java_thread, true);
2366 
2367    // C++ interp handles this in the interpreter
2368   check_and_handle_popframe(java_thread);
2369   check_and_handle_earlyret(java_thread);
2370 
2371   if (check_exceptions) {
2372     // check for pending exceptions (java_thread is set upon return)
2373     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2374 #ifndef _LP64
2375     jump_cc(Assembler::notEqual,
2376             RuntimeAddress(StubRoutines::forward_exception_entry()));
2377 #else
2378     // This used to conditionally jump to forward_exception, however it is
2379     // possible, if we relocate, that the branch will not reach. So we must jump
2380     // around so that we can always reach it.
2381 
2382     Label ok;
2383     jcc(Assembler::equal, ok);
2384     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2385     bind(ok);
2386 #endif // LP64
2387   }
2388 
2389   // get oop result if there is one and reset the value in the thread
2390   if (oop_result->is_valid()) {
2391     get_vm_result(oop_result, java_thread);
2392   }
2393 }
2394 
2395 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2396 
2397   // Calculate the value for last_Java_sp.  This is
2398   // somewhat subtle.  call_VM does an intermediate call
2399   // which places a return address on the stack just under the
2400   // stack pointer as the user finished with it. This allows
2401   // us to retrieve last_Java_pc from last_Java_sp[-1].
2402   // On 32bit we then have to push additional args on the stack to accomplish
2403   // the actual requested call. On 64bit call_VM can only use register args
2404   // so the only extra space is the return address that call_VM created.
2405   // This hopefully explains the calculations here.
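       // For example, on 64-bit with no stack arguments the layout at this point is roughly:
       //   [rsp + 0]         return address pushed by the 'call' in call_VM  (last_Java_sp[-1])
       //   [rsp + wordSize]  caller's stack                                   (last_Java_sp)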
2406 
2407 #ifdef _LP64
2408   // We've pushed one address, correct last_Java_sp
2409   lea(rax, Address(rsp, wordSize));
2410 #else
2411   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2412 #endif // LP64
2413 
2414   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2415 
2416 }
2417 
2418 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
2419 void MacroAssembler::call_VM_leaf0(address entry_point) {
2420   MacroAssembler::call_VM_leaf_base(entry_point, 0);
2421 }
2422 
2423 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2424   call_VM_leaf_base(entry_point, number_of_arguments);
2425 }
2426 
2427 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2428   pass_arg0(this, arg_0);
2429   call_VM_leaf(entry_point, 1);
2430 }
2431 
2432 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2433 
2434   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2435   pass_arg1(this, arg_1);
2436   pass_arg0(this, arg_0);
2437   call_VM_leaf(entry_point, 2);
2438 }
2439 
2440 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2441   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2442   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2443   pass_arg2(this, arg_2);
2444   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2445   pass_arg1(this, arg_1);
2446   pass_arg0(this, arg_0);
2447   call_VM_leaf(entry_point, 3);
2448 }
2449 
2450 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2451   pass_arg0(this, arg_0);
2452   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2453 }
2454 
2455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2456 
2457   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2458   pass_arg1(this, arg_1);
2459   pass_arg0(this, arg_0);
2460   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2461 }
2462 
2463 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2464   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2465   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2466   pass_arg2(this, arg_2);
2467   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2468   pass_arg1(this, arg_1);
2469   pass_arg0(this, arg_0);
2470   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2471 }
2472 
2473 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2474   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2475   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2476   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2477   pass_arg3(this, arg_3);
2478   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2479   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2480   pass_arg2(this, arg_2);
2481   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2482   pass_arg1(this, arg_1);
2483   pass_arg0(this, arg_0);
2484   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2485 }
2486 
2487 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2488   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2489   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2490   verify_oop(oop_result, "broken oop in call_VM_base");
2491 }
2492 
2493 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2494   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2495   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2496 }
2497 
2498 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2499 }
2500 
2501 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2502 }
2503 
2504 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2505   if (reachable(src1)) {
2506     cmpl(as_Address(src1), imm);
2507   } else {
2508     lea(rscratch1, src1);
2509     cmpl(Address(rscratch1, 0), imm);
2510   }
2511 }
2512 
2513 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2514   assert(!src2.is_lval(), "use cmpptr");
2515   if (reachable(src2)) {
2516     cmpl(src1, as_Address(src2));
2517   } else {
2518     lea(rscratch1, src2);
2519     cmpl(src1, Address(rscratch1, 0));
2520   }
2521 }
2522 
2523 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2524   Assembler::cmpl(src1, imm);
2525 }
2526 
2527 void MacroAssembler::cmp32(Register src1, Address src2) {
2528   Assembler::cmpl(src1, src2);
2529 }
2530 
2531 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2532   ucomisd(opr1, opr2);
2533 
2534   Label L;
2535   if (unordered_is_less) {
2536     movl(dst, -1);
2537     jcc(Assembler::parity, L);
2538     jcc(Assembler::below , L);
2539     movl(dst, 0);
2540     jcc(Assembler::equal , L);
2541     increment(dst);
2542   } else { // unordered is greater
2543     movl(dst, 1);
2544     jcc(Assembler::parity, L);
2545     jcc(Assembler::above , L);
2546     movl(dst, 0);
2547     jcc(Assembler::equal , L);
2548     decrementl(dst);
2549   }
2550   bind(L);
2551 }
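     // For example, with unordered_is_less == true a NaN operand sets PF so dst stays -1,
     // which corresponds to the Java dcmpl bytecode; unordered_is_less == false leaves +1
     // for NaN (dcmpg).  Otherwise dst is -1, 0 or +1 for below, equal and above.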
2552 
2553 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2554   ucomiss(opr1, opr2);
2555 
2556   Label L;
2557   if (unordered_is_less) {
2558     movl(dst, -1);
2559     jcc(Assembler::parity, L);
2560     jcc(Assembler::below , L);
2561     movl(dst, 0);
2562     jcc(Assembler::equal , L);
2563     increment(dst);
2564   } else { // unordered is greater
2565     movl(dst, 1);
2566     jcc(Assembler::parity, L);
2567     jcc(Assembler::above , L);
2568     movl(dst, 0);
2569     jcc(Assembler::equal , L);
2570     decrementl(dst);
2571   }
2572   bind(L);
2573 }
2574 
2575 
2576 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2577   if (reachable(src1)) {
2578     cmpb(as_Address(src1), imm);
2579   } else {
2580     lea(rscratch1, src1);
2581     cmpb(Address(rscratch1, 0), imm);
2582   }
2583 }
2584 
2585 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2586 #ifdef _LP64
2587   if (src2.is_lval()) {
2588     movptr(rscratch1, src2);
2589     Assembler::cmpq(src1, rscratch1);
2590   } else if (reachable(src2)) {
2591     cmpq(src1, as_Address(src2));
2592   } else {
2593     lea(rscratch1, src2);
2594     Assembler::cmpq(src1, Address(rscratch1, 0));
2595   }
2596 #else
2597   if (src2.is_lval()) {
2598     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2599   } else {
2600     cmpl(src1, as_Address(src2));
2601   }
2602 #endif // _LP64
2603 }
2604 
2605 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2606   assert(src2.is_lval(), "not a mem-mem compare");
2607 #ifdef _LP64
2608   // moves src2's literal address
2609   movptr(rscratch1, src2);
2610   Assembler::cmpq(src1, rscratch1);
2611 #else
2612   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2613 #endif // _LP64
2614 }
2615 
2616 void MacroAssembler::cmpoop(Register src1, Register src2) {
2617   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2618   bs->obj_equals(this, src1, src2);
2619 }
2620 
2621 void MacroAssembler::cmpoop(Register src1, Address src2) {
2622   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2623   bs->obj_equals(this, src1, src2);
2624 }
2625 
2626 #ifdef _LP64
2627 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2628   movoop(rscratch1, src2);
2629   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2630   bs->obj_equals(this, src1, rscratch1);
2631 }
2632 #endif
2633 
2634 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2635   if (reachable(adr)) {
2636     if (os::is_MP())
2637       lock();
2638     cmpxchgptr(reg, as_Address(adr));
2639   } else {
2640     lea(rscratch1, adr);
2641     if (os::is_MP())
2642       lock();
2643     cmpxchgptr(reg, Address(rscratch1, 0));
2644   }
2645 }
2646 
2647 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2648   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2649 }
2650 
2651 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2652   if (reachable(src)) {
2653     Assembler::comisd(dst, as_Address(src));
2654   } else {
2655     lea(rscratch1, src);
2656     Assembler::comisd(dst, Address(rscratch1, 0));
2657   }
2658 }
2659 
2660 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2661   if (reachable(src)) {
2662     Assembler::comiss(dst, as_Address(src));
2663   } else {
2664     lea(rscratch1, src);
2665     Assembler::comiss(dst, Address(rscratch1, 0));
2666   }
2667 }
2668 
2669 
2670 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2671   Condition negated_cond = negate_condition(cond);
2672   Label L;
2673   jcc(negated_cond, L);
2674   pushf(); // Preserve flags
2675   atomic_incl(counter_addr);
2676   popf();
2677   bind(L);
2678 }
2679 
2680 int MacroAssembler::corrected_idivl(Register reg) {
2681   // Full implementation of Java idiv and irem; checks for
2682   // special case as described in JVM spec., p.243 & p.271.
2683   // The function returns the (pc) offset of the idivl
2684   // instruction - may be needed for implicit exceptions.
2685   //
2686   //         normal case                           special case
2687   //
2688   // input : rax,: dividend                         min_int
2689   //         reg: divisor   (may not be rax,/rdx)   -1
2690   //
2691   // output: rax,: quotient  (= rax, idiv reg)       min_int
2692   //         rdx: remainder (= rax, irem reg)       0
2693   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
2694   const int min_int = 0x80000000;
2695   Label normal_case, special_case;
2696 
2697   // check for special case
2698   cmpl(rax, min_int);
2699   jcc(Assembler::notEqual, normal_case);
2700   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2701   cmpl(reg, -1);
2702   jcc(Assembler::equal, special_case);
2703 
2704   // handle normal case
2705   bind(normal_case);
2706   cdql();
2707   int idivl_offset = offset();
2708   idivl(reg);
2709 
2710   // normal and special case exit
2711   bind(special_case);
2712 
2713   return idivl_offset;
2714 }
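     // For example, 0x80000000 / -1 cannot be executed by idivl (the quotient +2^31 would
     // raise #DE), so the special case above returns quotient = 0x80000000 in rax and
     // remainder = 0 in rdx without dividing, as the JVM spec requires.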
2715 
2716 
2717 
2718 void MacroAssembler::decrementl(Register reg, int value) {
2719   if (value == min_jint) {subl(reg, value) ; return; }
2720   if (value <  0) { incrementl(reg, -value); return; }
2721   if (value == 0) {                        ; return; }
2722   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2723   /* else */      { subl(reg, value)       ; return; }
2724 }
2725 
2726 void MacroAssembler::decrementl(Address dst, int value) {
2727   if (value == min_jint) {subl(dst, value) ; return; }
2728   if (value <  0) { incrementl(dst, -value); return; }
2729   if (value == 0) {                        ; return; }
2730   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2731   /* else */      { subl(dst, value)       ; return; }
2732 }
2733 
2734 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2735   assert (shift_value > 0, "illegal shift value");
2736   Label _is_positive;
2737   testl (reg, reg);
2738   jcc (Assembler::positive, _is_positive);
2739   int offset = (1 << shift_value) - 1 ;
2740 
2741   if (offset == 1) {
2742     incrementl(reg);
2743   } else {
2744     addl(reg, offset);
2745   }
2746 
2747   bind (_is_positive);
2748   sarl(reg, shift_value);
2749 }
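     // For example, dividing -7 by 8 (shift_value == 3): a plain 'sarl reg, 3' would yield -1
     // (rounding toward negative infinity), but adding the bias (1 << 3) - 1 = 7 first gives
     // 0 >> 3 == 0, i.e. the truncation toward zero required for Java integer division.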
2750 
2751 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2752   if (reachable(src)) {
2753     Assembler::divsd(dst, as_Address(src));
2754   } else {
2755     lea(rscratch1, src);
2756     Assembler::divsd(dst, Address(rscratch1, 0));
2757   }
2758 }
2759 
2760 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2761   if (reachable(src)) {
2762     Assembler::divss(dst, as_Address(src));
2763   } else {
2764     lea(rscratch1, src);
2765     Assembler::divss(dst, Address(rscratch1, 0));
2766   }
2767 }
2768 
2769 // !defined(COMPILER2) is because of stupid core builds
2770 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2771 void MacroAssembler::empty_FPU_stack() {
2772   if (VM_Version::supports_mmx()) {
2773     emms();
2774   } else {
2775     for (int i = 8; i-- > 0; ) ffree(i);
2776   }
2777 }
2778 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2779 
2780 
2781 void MacroAssembler::enter() {
2782   push(rbp);
2783   mov(rbp, rsp);
2784 }
2785 
2786 // A 5 byte nop that is safe for patching (see patch_verified_entry)
2787 void MacroAssembler::fat_nop() {
2788   if (UseAddressNop) {
2789     addr_nop_5();
2790   } else {
2791     emit_int8(0x26); // es:
2792     emit_int8(0x2e); // cs:
2793     emit_int8(0x64); // fs:
2794     emit_int8(0x65); // gs:
2795     emit_int8((unsigned char)0x90);
2796   }
2797 }
2798 
2799 void MacroAssembler::fcmp(Register tmp) {
2800   fcmp(tmp, 1, true, true);
2801 }
2802 
2803 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2804   assert(!pop_right || pop_left, "usage error");
2805   if (VM_Version::supports_cmov()) {
2806     assert(tmp == noreg, "unneeded temp");
2807     if (pop_left) {
2808       fucomip(index);
2809     } else {
2810       fucomi(index);
2811     }
2812     if (pop_right) {
2813       fpop();
2814     }
2815   } else {
2816     assert(tmp != noreg, "need temp");
2817     if (pop_left) {
2818       if (pop_right) {
2819         fcompp();
2820       } else {
2821         fcomp(index);
2822       }
2823     } else {
2824       fcom(index);
2825     }
2826     // convert FPU condition into eflags condition via rax,
2827     save_rax(tmp);
2828     fwait(); fnstsw_ax();
2829     sahf();
2830     restore_rax(tmp);
2831   }
2832   // condition codes set as follows:
2833   //
2834   // CF (corresponds to C0) if x < y
2835   // PF (corresponds to C2) if unordered
2836   // ZF (corresponds to C3) if x = y
2837 }
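// Usage sketch (labels assumed for illustration), mirroring fcmp2int() below:
//   fcmp(tmp);                        // compare ST(0) with ST(1) and pop both
//   jcc(Assembler::parity, L_nan);    // PF set: operands were unordered (NaN)
//   jcc(Assembler::below,  L_less);   // CF set: x < y
//   jcc(Assembler::equal,  L_equal);  // ZF set: x == y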
2838 
2839 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2840   fcmp2int(dst, unordered_is_less, 1, true, true);
2841 }
2842 
2843 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2844   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2845   Label L;
2846   if (unordered_is_less) {
2847     movl(dst, -1);
2848     jcc(Assembler::parity, L);
2849     jcc(Assembler::below , L);
2850     movl(dst, 0);
2851     jcc(Assembler::equal , L);
2852     increment(dst);
2853   } else { // unordered is greater
2854     movl(dst, 1);
2855     jcc(Assembler::parity, L);
2856     jcc(Assembler::above , L);
2857     movl(dst, 0);
2858     jcc(Assembler::equal , L);
2859     decrementl(dst);
2860   }
2861   bind(L);
2862 }
2863 
2864 void MacroAssembler::fld_d(AddressLiteral src) {
2865   fld_d(as_Address(src));
2866 }
2867 
2868 void MacroAssembler::fld_s(AddressLiteral src) {
2869   fld_s(as_Address(src));
2870 }
2871 
2872 void MacroAssembler::fld_x(AddressLiteral src) {
2873   Assembler::fld_x(as_Address(src));
2874 }
2875 
2876 void MacroAssembler::fldcw(AddressLiteral src) {
2877   Assembler::fldcw(as_Address(src));
2878 }
2879 
2880 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2881   if (reachable(src)) {
2882     Assembler::mulpd(dst, as_Address(src));
2883   } else {
2884     lea(rscratch1, src);
2885     Assembler::mulpd(dst, Address(rscratch1, 0));
2886   }
2887 }
2888 
2889 void MacroAssembler::increase_precision() {
2890   subptr(rsp, BytesPerWord);
2891   fnstcw(Address(rsp, 0));
2892   movl(rax, Address(rsp, 0));
2893   orl(rax, 0x300);
2894   push(rax);
2895   fldcw(Address(rsp, 0));
2896   pop(rax);
2897 }
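// Note on the constant above: bits 8..9 of the x87 control word form the
// precision-control field; OR-ing in 0x300 sets it to 11b (64-bit extended
// precision). restore_precision() below reloads the original control word
// that was left on the stack.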
2898 
2899 void MacroAssembler::restore_precision() {
2900   fldcw(Address(rsp, 0));
2901   addptr(rsp, BytesPerWord);
2902 }
2903 
2904 void MacroAssembler::fpop() {
2905   ffree();
2906   fincstp();
2907 }
2908 
2909 void MacroAssembler::load_float(Address src) {
2910   if (UseSSE >= 1) {
2911     movflt(xmm0, src);
2912   } else {
2913     LP64_ONLY(ShouldNotReachHere());
2914     NOT_LP64(fld_s(src));
2915   }
2916 }
2917 
2918 void MacroAssembler::store_float(Address dst) {
2919   if (UseSSE >= 1) {
2920     movflt(dst, xmm0);
2921   } else {
2922     LP64_ONLY(ShouldNotReachHere());
2923     NOT_LP64(fstp_s(dst));
2924   }
2925 }
2926 
2927 void MacroAssembler::load_double(Address src) {
2928   if (UseSSE >= 2) {
2929     movdbl(xmm0, src);
2930   } else {
2931     LP64_ONLY(ShouldNotReachHere());
2932     NOT_LP64(fld_d(src));
2933   }
2934 }
2935 
2936 void MacroAssembler::store_double(Address dst) {
2937   if (UseSSE >= 2) {
2938     movdbl(dst, xmm0);
2939   } else {
2940     LP64_ONLY(ShouldNotReachHere());
2941     NOT_LP64(fstp_d(dst));
2942   }
2943 }
2944 
2945 void MacroAssembler::fremr(Register tmp) {
2946   save_rax(tmp);
2947   { Label L;
2948     bind(L);
2949     fprem();
2950     fwait(); fnstsw_ax();
2951 #ifdef _LP64
2952     testl(rax, 0x400);
2953     jcc(Assembler::notEqual, L);
2954 #else
2955     sahf();
2956     jcc(Assembler::parity, L);
2957 #endif // _LP64
2958   }
2959   restore_rax(tmp);
2960   // Result is in ST0.
2961   // Note: fxch & fpop to get rid of ST1
2962   // (otherwise FPU stack could overflow eventually)
2963   fxch(1);
2964   fpop();
2965 }
2966 
2967 // dst = c = a * b + c
2968 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2969   Assembler::vfmadd231sd(c, a, b);
2970   if (dst != c) {
2971     movdbl(dst, c);
2972   }
2973 }
2974 
2975 // dst = c = a * b + c
2976 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2977   Assembler::vfmadd231ss(c, a, b);
2978   if (dst != c) {
2979     movflt(dst, c);
2980   }
2981 }
2982 
2983 // dst = c = a * b + c
2984 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2985   Assembler::vfmadd231pd(c, a, b, vector_len);
2986   if (dst != c) {
2987     vmovdqu(dst, c);
2988   }
2989 }
2990 
2991 // dst = c = a * b + c
2992 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2993   Assembler::vfmadd231ps(c, a, b, vector_len);
2994   if (dst != c) {
2995     vmovdqu(dst, c);
2996   }
2997 }
2998 
2999 // dst = c = a * b + c
3000 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
3001   Assembler::vfmadd231pd(c, a, b, vector_len);
3002   if (dst != c) {
3003     vmovdqu(dst, c);
3004   }
3005 }
3006 
3007 // dst = c = a * b + c
3008 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
3009   Assembler::vfmadd231ps(c, a, b, vector_len);
3010   if (dst != c) {
3011     vmovdqu(dst, c);
3012   }
3013 }
3014 
3015 void MacroAssembler::incrementl(AddressLiteral dst) {
3016   if (reachable(dst)) {
3017     incrementl(as_Address(dst));
3018   } else {
3019     lea(rscratch1, dst);
3020     incrementl(Address(rscratch1, 0));
3021   }
3022 }
3023 
3024 void MacroAssembler::incrementl(ArrayAddress dst) {
3025   incrementl(as_Address(dst));
3026 }
3027 
3028 void MacroAssembler::incrementl(Register reg, int value) {
3029   if (value == min_jint) {addl(reg, value) ; return; }
3030   if (value <  0) { decrementl(reg, -value); return; }
3031   if (value == 0) {                        ; return; }
3032   if (value == 1 && UseIncDec) { incl(reg) ; return; }
3033   /* else */      { addl(reg, value)       ; return; }
3034 }
3035 
3036 void MacroAssembler::incrementl(Address dst, int value) {
3037   if (value == min_jint) {addl(dst, value) ; return; }
3038   if (value <  0) { decrementl(dst, -value); return; }
3039   if (value == 0) {                        ; return; }
3040   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3041   /* else */      { addl(dst, value)       ; return; }
3042 }
3043 
3044 void MacroAssembler::jump(AddressLiteral dst) {
3045   if (reachable(dst)) {
3046     jmp_literal(dst.target(), dst.rspec());
3047   } else {
3048     lea(rscratch1, dst);
3049     jmp(rscratch1);
3050   }
3051 }
3052 
3053 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3054   if (reachable(dst)) {
3055     InstructionMark im(this);
3056     relocate(dst.reloc());
3057     const int short_size = 2;
3058     const int long_size = 6;
3059     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3060     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3061       // 0111 tttn #8-bit disp
3062       emit_int8(0x70 | cc);
3063       emit_int8((offs - short_size) & 0xFF);
3064     } else {
3065       // 0000 1111 1000 tttn #32-bit disp
3066       emit_int8(0x0F);
3067       emit_int8((unsigned char)(0x80 | cc));
3068       emit_int32(offs - long_size);
3069     }
3070   } else {
3071 #ifdef ASSERT
3072     warning("reversing conditional branch");
3073 #endif /* ASSERT */
3074     Label skip;
3075     jccb(reverse[cc], skip);
3076     lea(rscratch1, dst);
3077     Assembler::jmp(rscratch1);
3078     bind(skip);
3079   }
3080 }
3081 
3082 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3083   if (reachable(src)) {
3084     Assembler::ldmxcsr(as_Address(src));
3085   } else {
3086     lea(rscratch1, src);
3087     Assembler::ldmxcsr(Address(rscratch1, 0));
3088   }
3089 }
3090 
3091 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3092   int off;
3093   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3094     off = offset();
3095     movsbl(dst, src); // movsxb
3096   } else {
3097     off = load_unsigned_byte(dst, src);
3098     shll(dst, 24);
3099     sarl(dst, 24);
3100   }
3101   return off;
3102 }
3103 
3104 // Note: load_signed_short used to be called load_signed_word.
3105 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3106 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3107 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3108 int MacroAssembler::load_signed_short(Register dst, Address src) {
3109   int off;
3110   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3111     // A signed 16 => 64 bit extension would also be safe here, but the
3112     // 64-bit build has always sign-extended only to 32 bits, which suggests
3113     // callers rely on no more than 32 bits of the result.
3114     off = offset();
3115     movswl(dst, src); // movsxw
3116   } else {
3117     off = load_unsigned_short(dst, src);
3118     shll(dst, 16);
3119     sarl(dst, 16);
3120   }
3121   return off;
3122 }
3123 
3124 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3125   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3126   // and "3.9 Partial Register Penalties" (p. 22).
3127   int off;
3128   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3129     off = offset();
3130     movzbl(dst, src); // movzxb
3131   } else {
3132     xorl(dst, dst);
3133     off = offset();
3134     movb(dst, src);
3135   }
3136   return off;
3137 }
3138 
3139 // Note: load_unsigned_short used to be called load_unsigned_word.
3140 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3141   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3142   // and "3.9 Partial Register Penalties" (p. 22).
3143   int off;
3144   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3145     off = offset();
3146     movzwl(dst, src); // movzxw
3147   } else {
3148     xorl(dst, dst);
3149     off = offset();
3150     movw(dst, src);
3151   }
3152   return off;
3153 }
3154 
3155 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3156   switch (size_in_bytes) {
3157 #ifndef _LP64
3158   case  8:
3159     assert(dst2 != noreg, "second dest register required");
3160     movl(dst,  src);
3161     movl(dst2, src.plus_disp(BytesPerInt));
3162     break;
3163 #else
3164   case  8:  movq(dst, src); break;
3165 #endif
3166   case  4:  movl(dst, src); break;
3167   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3168   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3169   default:  ShouldNotReachHere();
3170   }
3171 }
3172 
3173 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3174   switch (size_in_bytes) {
3175 #ifndef _LP64
3176   case  8:
3177     assert(src2 != noreg, "second source register required");
3178     movl(dst,                        src);
3179     movl(dst.plus_disp(BytesPerInt), src2);
3180     break;
3181 #else
3182   case  8:  movq(dst, src); break;
3183 #endif
3184   case  4:  movl(dst, src); break;
3185   case  2:  movw(dst, src); break;
3186   case  1:  movb(dst, src); break;
3187   default:  ShouldNotReachHere();
3188   }
3189 }
3190 
3191 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3192   if (reachable(dst)) {
3193     movl(as_Address(dst), src);
3194   } else {
3195     lea(rscratch1, dst);
3196     movl(Address(rscratch1, 0), src);
3197   }
3198 }
3199 
3200 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3201   if (reachable(src)) {
3202     movl(dst, as_Address(src));
3203   } else {
3204     lea(rscratch1, src);
3205     movl(dst, Address(rscratch1, 0));
3206   }
3207 }
3208 
3209 // C++ bool manipulation
3210 
3211 void MacroAssembler::movbool(Register dst, Address src) {
3212   if(sizeof(bool) == 1)
3213     movb(dst, src);
3214   else if(sizeof(bool) == 2)
3215     movw(dst, src);
3216   else if(sizeof(bool) == 4)
3217     movl(dst, src);
3218   else
3219     // unsupported
3220     ShouldNotReachHere();
3221 }
3222 
3223 void MacroAssembler::movbool(Address dst, bool boolconst) {
3224   if(sizeof(bool) == 1)
3225     movb(dst, (int) boolconst);
3226   else if(sizeof(bool) == 2)
3227     movw(dst, (int) boolconst);
3228   else if(sizeof(bool) == 4)
3229     movl(dst, (int) boolconst);
3230   else
3231     // unsupported
3232     ShouldNotReachHere();
3233 }
3234 
3235 void MacroAssembler::movbool(Address dst, Register src) {
3236   if(sizeof(bool) == 1)
3237     movb(dst, src);
3238   else if(sizeof(bool) == 2)
3239     movw(dst, src);
3240   else if(sizeof(bool) == 4)
3241     movl(dst, src);
3242   else
3243     // unsupported
3244     ShouldNotReachHere();
3245 }
3246 
3247 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3248   movb(as_Address(dst), src);
3249 }
3250 
3251 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3252   if (reachable(src)) {
3253     movdl(dst, as_Address(src));
3254   } else {
3255     lea(rscratch1, src);
3256     movdl(dst, Address(rscratch1, 0));
3257   }
3258 }
3259 
3260 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3261   if (reachable(src)) {
3262     movq(dst, as_Address(src));
3263   } else {
3264     lea(rscratch1, src);
3265     movq(dst, Address(rscratch1, 0));
3266   }
3267 }
3268 
3269 #ifdef COMPILER2
3270 void MacroAssembler::setvectmask(Register dst, Register src) {
3271   guarantee(PostLoopMultiversioning, "must be");
3272   Assembler::movl(dst, 1);
3273   Assembler::shlxl(dst, dst, src);
3274   Assembler::decl(dst);
3275   Assembler::kmovdl(k1, dst);
3276   Assembler::movl(dst, src);
3277 }
3278 
3279 void MacroAssembler::restorevectmask() {
3280   guarantee(PostLoopMultiversioning, "must be");
3281   Assembler::knotwl(k1, k0);
3282 }
3283 #endif // COMPILER2
3284 
3285 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3286   if (reachable(src)) {
3287     if (UseXmmLoadAndClearUpper) {
3288       movsd (dst, as_Address(src));
3289     } else {
3290       movlpd(dst, as_Address(src));
3291     }
3292   } else {
3293     lea(rscratch1, src);
3294     if (UseXmmLoadAndClearUpper) {
3295       movsd (dst, Address(rscratch1, 0));
3296     } else {
3297       movlpd(dst, Address(rscratch1, 0));
3298     }
3299   }
3300 }
3301 
3302 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3303   if (reachable(src)) {
3304     movss(dst, as_Address(src));
3305   } else {
3306     lea(rscratch1, src);
3307     movss(dst, Address(rscratch1, 0));
3308   }
3309 }
3310 
3311 void MacroAssembler::movptr(Register dst, Register src) {
3312   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3313 }
3314 
3315 void MacroAssembler::movptr(Register dst, Address src) {
3316   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3317 }
3318 
3319 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3320 void MacroAssembler::movptr(Register dst, intptr_t src) {
3321   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3322 }
3323 
3324 void MacroAssembler::movptr(Address dst, Register src) {
3325   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3326 }
3327 
3328 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3329     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3330     Assembler::movdqu(dst, src);
3331 }
3332 
3333 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3334     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3335     Assembler::movdqu(dst, src);
3336 }
3337 
3338 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3339     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3340     Assembler::movdqu(dst, src);
3341 }
3342 
3343 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3344   if (reachable(src)) {
3345     movdqu(dst, as_Address(src));
3346   } else {
3347     lea(scratchReg, src);
3348     movdqu(dst, Address(scratchReg, 0));
3349   }
3350 }
3351 
3352 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3353     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3354     Assembler::vmovdqu(dst, src);
3355 }
3356 
3357 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3358     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3359     Assembler::vmovdqu(dst, src);
3360 }
3361 
3362 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3363     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3364     Assembler::vmovdqu(dst, src);
3365 }
3366 
3367 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3368   if (reachable(src)) {
3369     vmovdqu(dst, as_Address(src));
3370   }
3371   else {
3372     lea(rscratch1, src);
3373     vmovdqu(dst, Address(rscratch1, 0));
3374   }
3375 }
3376 
3377 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3378   if (reachable(src)) {
3379     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3380   } else {
3381     lea(rscratch, src);
3382     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3383   }
3384 }
3385 
3386 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3387   if (reachable(src)) {
3388     Assembler::movdqa(dst, as_Address(src));
3389   } else {
3390     lea(rscratch1, src);
3391     Assembler::movdqa(dst, Address(rscratch1, 0));
3392   }
3393 }
3394 
3395 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3396   if (reachable(src)) {
3397     Assembler::movsd(dst, as_Address(src));
3398   } else {
3399     lea(rscratch1, src);
3400     Assembler::movsd(dst, Address(rscratch1, 0));
3401   }
3402 }
3403 
3404 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3405   if (reachable(src)) {
3406     Assembler::movss(dst, as_Address(src));
3407   } else {
3408     lea(rscratch1, src);
3409     Assembler::movss(dst, Address(rscratch1, 0));
3410   }
3411 }
3412 
3413 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3414   if (reachable(src)) {
3415     Assembler::mulsd(dst, as_Address(src));
3416   } else {
3417     lea(rscratch1, src);
3418     Assembler::mulsd(dst, Address(rscratch1, 0));
3419   }
3420 }
3421 
3422 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3423   if (reachable(src)) {
3424     Assembler::mulss(dst, as_Address(src));
3425   } else {
3426     lea(rscratch1, src);
3427     Assembler::mulss(dst, Address(rscratch1, 0));
3428   }
3429 }
3430 
3431 void MacroAssembler::null_check(Register reg, int offset) {
3432   if (needs_explicit_null_check(offset)) {
3433     // provoke OS NULL exception if reg = NULL by
3434     // accessing M[reg] w/o changing any (non-CC) registers
3435     // NOTE: cmpl is plenty here to provoke a segv
3436     cmpptr(rax, Address(reg, 0));
3437     // Note: should probably use testl(rax, Address(reg, 0));
3438     //       may be shorter code (however, this version of
3439     //       testl needs to be implemented first)
3440   } else {
3441     // nothing to do, (later) access of M[reg + offset]
3442     // will provoke OS NULL exception if reg = NULL
3443   }
3444 }
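// Illustrative sketch (offset assumed): null_check(rbx, 8) emits nothing,
// because the later access of M[rbx + 8] already faults if rbx is NULL;
// only offsets outside the OS-protected guard area force the explicit
// cmpptr() probe above.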
3445 
3446 void MacroAssembler::os_breakpoint() {
3447   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3448   // (e.g., MSVC can't call ps() otherwise)
3449   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3450 }
3451 
3452 void MacroAssembler::unimplemented(const char* what) {
3453   const char* buf = NULL;
3454   {
3455     ResourceMark rm;
3456     stringStream ss;
3457     ss.print("unimplemented: %s", what);
3458     buf = code_string(ss.as_string());
3459   }
3460   stop(buf);
3461 }
3462 
3463 #ifdef _LP64
3464 #define XSTATE_BV 0x200
3465 #endif
3466 
3467 void MacroAssembler::pop_CPU_state() {
3468   pop_FPU_state();
3469   pop_IU_state();
3470 }
3471 
3472 void MacroAssembler::pop_FPU_state() {
3473 #ifndef _LP64
3474   frstor(Address(rsp, 0));
3475 #else
3476   fxrstor(Address(rsp, 0));
3477 #endif
3478   addptr(rsp, FPUStateSizeInWords * wordSize);
3479 }
3480 
3481 void MacroAssembler::pop_IU_state() {
3482   popa();
3483   LP64_ONLY(addq(rsp, 8));
3484   popf();
3485 }
3486 
3487 // Save Integer and Float state
3488 // Warning: Stack must be 16 byte aligned (64bit)
3489 void MacroAssembler::push_CPU_state() {
3490   push_IU_state();
3491   push_FPU_state();
3492 }
3493 
3494 void MacroAssembler::push_FPU_state() {
3495   subptr(rsp, FPUStateSizeInWords * wordSize);
3496 #ifndef _LP64
3497   fnsave(Address(rsp, 0));
3498   fwait();
3499 #else
3500   fxsave(Address(rsp, 0));
3501 #endif // LP64
3502 }
3503 
3504 void MacroAssembler::push_IU_state() {
3505   // Push flags first because pusha kills them
3506   pushf();
3507   // Make sure rsp stays 16-byte aligned
3508   LP64_ONLY(subq(rsp, 8));
3509   pusha();
3510 }
3511 
3512 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
       // determine java_thread register
3513   if (!java_thread->is_valid()) {
3514     java_thread = rdi;
3515     get_thread(java_thread);
3516   }
3517   // we must set sp to zero to clear frame
3518   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3519   if (clear_fp) {
3520     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3521   }
3522 
3523   // Always clear the pc because it could have been set by make_walkable()
3524   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3525 
3526   vzeroupper();
3527 }
3528 
3529 void MacroAssembler::restore_rax(Register tmp) {
3530   if (tmp == noreg) pop(rax);
3531   else if (tmp != rax) mov(rax, tmp);
3532 }
3533 
3534 void MacroAssembler::round_to(Register reg, int modulus) {
3535   addptr(reg, modulus - 1);
3536   andptr(reg, -modulus);
3537 }
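// Worked sketch: round_to(reg, 8) with reg == 13 computes (13 + 7) & -8 == 16,
// while an already aligned value such as 16 is left unchanged.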
3538 
3539 void MacroAssembler::save_rax(Register tmp) {
3540   if (tmp == noreg) push(rax);
3541   else if (tmp != rax) mov(tmp, rax);
3542 }
3543 
3544 // Write serialization page so VM thread can do a pseudo remote membar.
3545 // We use the current thread pointer to calculate a thread specific
3546 // offset to write to within the page. This minimizes bus traffic
3547 // due to cache line collision.
3548 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3549   movl(tmp, thread);
3550   shrl(tmp, os::get_serialize_page_shift_count());
3551   andl(tmp, (os::vm_page_size() - sizeof(int)));
3552 
3553   Address index(noreg, tmp, Address::times_1);
3554   ExternalAddress page(os::get_memory_serialize_page());
3555 
3556   // Size of store must match masking code above
3557   movl(as_Address(ArrayAddress(page, index)), tmp);
3558 }
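// Worked sketch (shift count and thread address assumed for illustration):
//   thread == 0x...1230, shift == 3   ->  tmp == 0x246
//   tmp &= (4096 - 4)                 ->  0x244, an int-aligned slot in the page
// so distinct threads tend to write to distinct cache lines of the shared page.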
3559 
3560 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3561   if (SafepointMechanism::uses_thread_local_poll()) {
3562 #ifdef _LP64
3563     assert(thread_reg == r15_thread, "should be");
3564 #else
3565     if (thread_reg == noreg) {
3566       thread_reg = temp_reg;
3567       get_thread(thread_reg);
3568     }
3569 #endif
3570     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3571     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3572   } else {
3573     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3574         SafepointSynchronize::_not_synchronized);
3575     jcc(Assembler::notEqual, slow_path);
3576   }
3577 }
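// Usage sketch (label name assumed): a typical LP64 poll site looks like
//   Label slow_path;
//   safepoint_poll(slow_path, r15_thread, rscratch1);
//   ... fast path continues ...
// where slow_path branches into the VM's safepoint/handshake handling.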
3578 
3579 // Calls to C land
3580 //
3581 // When entering C land, the rbp and rsp of the last Java frame have to be recorded
3582 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3583 // has to be reset to 0. This is required to allow proper stack traversal.
3584 void MacroAssembler::set_last_Java_frame(Register java_thread,
3585                                          Register last_java_sp,
3586                                          Register last_java_fp,
3587                                          address  last_java_pc) {
3588   vzeroupper();
3589   // determine java_thread register
3590   if (!java_thread->is_valid()) {
3591     java_thread = rdi;
3592     get_thread(java_thread);
3593   }
3594   // determine last_java_sp register
3595   if (!last_java_sp->is_valid()) {
3596     last_java_sp = rsp;
3597   }
3598 
3599   // last_java_fp is optional
3600 
3601   if (last_java_fp->is_valid()) {
3602     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3603   }
3604 
3605   // last_java_pc is optional
3606 
3607   if (last_java_pc != NULL) {
3608     lea(Address(java_thread,
3609                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3610         InternalAddress(last_java_pc));
3611 
3612   }
3613   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3614 }
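// Pairing sketch (register and pc choices assumed): a VM call-out records the
// anchor with set_last_Java_frame(noreg, noreg, rbp, pc_after_call) before
// entering C land and clears it with reset_last_Java_frame(noreg, true) on
// return, so stack traversal never sees a stale last Java frame.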
3615 
3616 void MacroAssembler::shlptr(Register dst, int imm8) {
3617   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3618 }
3619 
3620 void MacroAssembler::shrptr(Register dst, int imm8) {
3621   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3622 }
3623 
3624 void MacroAssembler::sign_extend_byte(Register reg) {
3625   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3626     movsbl(reg, reg); // movsxb
3627   } else {
3628     shll(reg, 24);
3629     sarl(reg, 24);
3630   }
3631 }
3632 
3633 void MacroAssembler::sign_extend_short(Register reg) {
3634   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3635     movswl(reg, reg); // movsxw
3636   } else {
3637     shll(reg, 16);
3638     sarl(reg, 16);
3639   }
3640 }
3641 
3642 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3643   assert(reachable(src), "Address should be reachable");
3644   testl(dst, as_Address(src));
3645 }
3646 
3647 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3648   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3649   Assembler::pcmpeqb(dst, src);
3650 }
3651 
3652 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3653   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3654   Assembler::pcmpeqw(dst, src);
3655 }
3656 
3657 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3658   assert((dst->encoding() < 16),"XMM register should be 0-15");
3659   Assembler::pcmpestri(dst, src, imm8);
3660 }
3661 
3662 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3663   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3664   Assembler::pcmpestri(dst, src, imm8);
3665 }
3666 
3667 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3668   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3669   Assembler::pmovzxbw(dst, src);
3670 }
3671 
3672 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3673   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3674   Assembler::pmovzxbw(dst, src);
3675 }
3676 
3677 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3678   assert((src->encoding() < 16),"XMM register should be 0-15");
3679   Assembler::pmovmskb(dst, src);
3680 }
3681 
3682 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3683   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3684   Assembler::ptest(dst, src);
3685 }
3686 
3687 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3688   if (reachable(src)) {
3689     Assembler::sqrtsd(dst, as_Address(src));
3690   } else {
3691     lea(rscratch1, src);
3692     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3693   }
3694 }
3695 
3696 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3697   if (reachable(src)) {
3698     Assembler::sqrtss(dst, as_Address(src));
3699   } else {
3700     lea(rscratch1, src);
3701     Assembler::sqrtss(dst, Address(rscratch1, 0));
3702   }
3703 }
3704 
3705 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3706   if (reachable(src)) {
3707     Assembler::subsd(dst, as_Address(src));
3708   } else {
3709     lea(rscratch1, src);
3710     Assembler::subsd(dst, Address(rscratch1, 0));
3711   }
3712 }
3713 
3714 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3715   if (reachable(src)) {
3716     Assembler::subss(dst, as_Address(src));
3717   } else {
3718     lea(rscratch1, src);
3719     Assembler::subss(dst, Address(rscratch1, 0));
3720   }
3721 }
3722 
3723 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3724   if (reachable(src)) {
3725     Assembler::ucomisd(dst, as_Address(src));
3726   } else {
3727     lea(rscratch1, src);
3728     Assembler::ucomisd(dst, Address(rscratch1, 0));
3729   }
3730 }
3731 
3732 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3733   if (reachable(src)) {
3734     Assembler::ucomiss(dst, as_Address(src));
3735   } else {
3736     lea(rscratch1, src);
3737     Assembler::ucomiss(dst, Address(rscratch1, 0));
3738   }
3739 }
3740 
3741 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3742   // Used in sign-bit flipping with aligned address.
3743   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3744   if (reachable(src)) {
3745     Assembler::xorpd(dst, as_Address(src));
3746   } else {
3747     lea(rscratch1, src);
3748     Assembler::xorpd(dst, Address(rscratch1, 0));
3749   }
3750 }
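// Usage sketch (constant name hypothetical): negating a double in xmm0 is done
// by XOR-ing with a 16-byte-aligned mask whose only set bits are the sign bits,
// e.g. xorpd(xmm0, ExternalAddress(double_signflip)), where double_signflip
// points at two copies of 0x8000000000000000.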
3751 
3752 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3753   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3754     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3755   }
3756   else {
3757     Assembler::xorpd(dst, src);
3758   }
3759 }
3760 
3761 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3762   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3763     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3764   } else {
3765     Assembler::xorps(dst, src);
3766   }
3767 }
3768 
3769 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
3770   // Used in sign-bit flipping with aligned address.
3771   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3772   if (reachable(src)) {
3773     Assembler::xorps(dst, as_Address(src));
3774   } else {
3775     lea(rscratch1, src);
3776     Assembler::xorps(dst, Address(rscratch1, 0));
3777   }
3778 }
3779 
3780 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3781   // The SSE form requires a 16-byte aligned address (same constraint as the xorps/xorpd helpers above).
3782   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3783   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3784   if (reachable(src)) {
3785     Assembler::pshufb(dst, as_Address(src));
3786   } else {
3787     lea(rscratch1, src);
3788     Assembler::pshufb(dst, Address(rscratch1, 0));
3789   }
3790 }
3791 
3792 // AVX 3-operands instructions
3793 
3794 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3795   if (reachable(src)) {
3796     vaddsd(dst, nds, as_Address(src));
3797   } else {
3798     lea(rscratch1, src);
3799     vaddsd(dst, nds, Address(rscratch1, 0));
3800   }
3801 }
3802 
3803 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3804   if (reachable(src)) {
3805     vaddss(dst, nds, as_Address(src));
3806   } else {
3807     lea(rscratch1, src);
3808     vaddss(dst, nds, Address(rscratch1, 0));
3809   }
3810 }
3811 
3812 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3813   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3814   vandps(dst, nds, negate_field, vector_len);
3815 }
3816 
3817 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3818   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3819   vandpd(dst, nds, negate_field, vector_len);
3820 }
3821 
3822 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3823   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3824   Assembler::vpaddb(dst, nds, src, vector_len);
3825 }
3826 
3827 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3828   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3829   Assembler::vpaddb(dst, nds, src, vector_len);
3830 }
3831 
3832 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3833   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3834   Assembler::vpaddw(dst, nds, src, vector_len);
3835 }
3836 
3837 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3838   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3839   Assembler::vpaddw(dst, nds, src, vector_len);
3840 }
3841 
3842 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
3843   if (reachable(src)) {
3844     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3845   } else {
3846     lea(rscratch1, src);
3847     Assembler::vpand(dst, nds, Address(rscratch1, 0), vector_len);
3848   }
3849 }
3850 
3851 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3852   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3853   Assembler::vpbroadcastw(dst, src, vector_len);
3854 }
3855 
3856 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3857   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3858   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3859 }
3860 
3861 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3862   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3863   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3864 }
3865 
3866 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3867   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3868   Assembler::vpmovzxbw(dst, src, vector_len);
3869 }
3870 
3871 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3872   assert((src->encoding() < 16),"XMM register should be 0-15");
3873   Assembler::vpmovmskb(dst, src);
3874 }
3875 
3876 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3877   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3878   Assembler::vpmullw(dst, nds, src, vector_len);
3879 }
3880 
3881 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3882   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3883   Assembler::vpmullw(dst, nds, src, vector_len);
3884 }
3885 
3886 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3887   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3888   Assembler::vpsubb(dst, nds, src, vector_len);
3889 }
3890 
3891 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3892   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3893   Assembler::vpsubb(dst, nds, src, vector_len);
3894 }
3895 
3896 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3897   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3898   Assembler::vpsubw(dst, nds, src, vector_len);
3899 }
3900 
3901 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3902   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3903   Assembler::vpsubw(dst, nds, src, vector_len);
3904 }
3905 
3906 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3907   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3908   Assembler::vpsraw(dst, nds, shift, vector_len);
3909 }
3910 
3911 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3912   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3913   Assembler::vpsraw(dst, nds, shift, vector_len);
3914 }
3915 
3916 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3917   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3918   Assembler::vpsrlw(dst, nds, shift, vector_len);
3919 }
3920 
3921 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3922   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3923   Assembler::vpsrlw(dst, nds, shift, vector_len);
3924 }
3925 
3926 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3927   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3928   Assembler::vpsllw(dst, nds, shift, vector_len);
3929 }
3930 
3931 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3932   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3933   Assembler::vpsllw(dst, nds, shift, vector_len);
3934 }
3935 
3936 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3937   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3938   Assembler::vptest(dst, src);
3939 }
3940 
3941 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3942   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3943   Assembler::punpcklbw(dst, src);
3944 }
3945 
3946 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3947   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3948   Assembler::pshufd(dst, src, mode);
3949 }
3950 
3951 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3952   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3953   Assembler::pshuflw(dst, src, mode);
3954 }
3955 
3956 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
3957   if (reachable(src)) {
3958     vandpd(dst, nds, as_Address(src), vector_len);
3959   } else {
3960     lea(rscratch1, src);
3961     vandpd(dst, nds, Address(rscratch1, 0), vector_len);
3962   }
3963 }
3964 
3965 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
3966   if (reachable(src)) {
3967     vandps(dst, nds, as_Address(src), vector_len);
3968   } else {
3969     lea(rscratch1, src);
3970     vandps(dst, nds, Address(rscratch1, 0), vector_len);
3971   }
3972 }
3973 
3974 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3975   if (reachable(src)) {
3976     vdivsd(dst, nds, as_Address(src));
3977   } else {
3978     lea(rscratch1, src);
3979     vdivsd(dst, nds, Address(rscratch1, 0));
3980   }
3981 }
3982 
3983 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3984   if (reachable(src)) {
3985     vdivss(dst, nds, as_Address(src));
3986   } else {
3987     lea(rscratch1, src);
3988     vdivss(dst, nds, Address(rscratch1, 0));
3989   }
3990 }
3991 
3992 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3993   if (reachable(src)) {
3994     vmulsd(dst, nds, as_Address(src));
3995   } else {
3996     lea(rscratch1, src);
3997     vmulsd(dst, nds, Address(rscratch1, 0));
3998   }
3999 }
4000 
4001 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4002   if (reachable(src)) {
4003     vmulss(dst, nds, as_Address(src));
4004   } else {
4005     lea(rscratch1, src);
4006     vmulss(dst, nds, Address(rscratch1, 0));
4007   }
4008 }
4009 
4010 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4011   if (reachable(src)) {
4012     vsubsd(dst, nds, as_Address(src));
4013   } else {
4014     lea(rscratch1, src);
4015     vsubsd(dst, nds, Address(rscratch1, 0));
4016   }
4017 }
4018 
4019 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4020   if (reachable(src)) {
4021     vsubss(dst, nds, as_Address(src));
4022   } else {
4023     lea(rscratch1, src);
4024     vsubss(dst, nds, Address(rscratch1, 0));
4025   }
4026 }
4027 
4028 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4029   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4030   vxorps(dst, nds, src, Assembler::AVX_128bit);
4031 }
4032 
4033 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4034   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4035   vxorpd(dst, nds, src, Assembler::AVX_128bit);
4036 }
4037 
4038 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4039   if (reachable(src)) {
4040     vxorpd(dst, nds, as_Address(src), vector_len);
4041   } else {
4042     lea(rscratch1, src);
4043     vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
4044   }
4045 }
4046 
4047 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4048   if (reachable(src)) {
4049     vxorps(dst, nds, as_Address(src), vector_len);
4050   } else {
4051     lea(rscratch1, src);
4052     vxorps(dst, nds, Address(rscratch1, 0), vector_len);
4053   }
4054 }
4055 
4056 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4057   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4058   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4059   // The inverted mask is sign-extended
4060   andptr(possibly_jweak, inverted_jweak_mask);
4061 }
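// Worked sketch (handle value assumed): with weak_tag_mask == 1 the inverted
// mask is ~1 == -2, so andptr() clears only the low tag bit and a tagged jweak
// such as 0x7f00deadbeef1 becomes the untagged handle 0x7f00deadbeef0.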
4062 
4063 void MacroAssembler::resolve_jobject(Register value,
4064                                      Register thread,
4065                                      Register tmp) {
4066   assert_different_registers(value, thread, tmp);
4067   Label done, not_weak;
4068   testptr(value, value);
4069   jcc(Assembler::zero, done);                // Use NULL as-is.
4070   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4071   jcc(Assembler::zero, not_weak);
4072   // Resolve jweak.
4073   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4074                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4075   verify_oop(value);
4076   jmp(done);
4077   bind(not_weak);
4078   // Resolve (untagged) jobject.
4079   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4080   verify_oop(value);
4081   bind(done);
4082 }
4083 
4084 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4085   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4086 }
4087 
4088 // Force generation of a 4 byte immediate value even if it fits into 8bit
4089 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4090   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4091 }
4092 
4093 void MacroAssembler::subptr(Register dst, Register src) {
4094   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4095 }
4096 
4097 // C++ bool manipulation
4098 void MacroAssembler::testbool(Register dst) {
4099   if(sizeof(bool) == 1)
4100     testb(dst, 0xff);
4101   else if(sizeof(bool) == 2) {
4102     // testw implementation needed for two byte bools
4103     ShouldNotReachHere();
4104   } else if(sizeof(bool) == 4)
4105     testl(dst, dst);
4106   else
4107     // unsupported
4108     ShouldNotReachHere();
4109 }
4110 
4111 void MacroAssembler::testptr(Register dst, Register src) {
4112   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4113 }
4114 
4115 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4116 void MacroAssembler::tlab_allocate(Register thread, Register obj,
4117                                    Register var_size_in_bytes,
4118                                    int con_size_in_bytes,
4119                                    Register t1,
4120                                    Register t2,
4121                                    Label& slow_case) {
4122   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4123   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4124 }
4125 
4126 // Defines obj, preserves var_size_in_bytes
4127 void MacroAssembler::eden_allocate(Register thread, Register obj,
4128                                    Register var_size_in_bytes,
4129                                    int con_size_in_bytes,
4130                                    Register t1,
4131                                    Label& slow_case) {
4132   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4133   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4134 }
4135 
4136 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
4137 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4138   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4139   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4140   Label done;
4141 
4142   testptr(length_in_bytes, length_in_bytes);
4143   jcc(Assembler::zero, done);
4144 
4145   // initialize topmost word, divide index by 2, check if odd and test if zero
4146   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4147 #ifdef ASSERT
4148   {
4149     Label L;
4150     testptr(length_in_bytes, BytesPerWord - 1);
4151     jcc(Assembler::zero, L);
4152     stop("length must be a multiple of BytesPerWord");
4153     bind(L);
4154   }
4155 #endif
4156   Register index = length_in_bytes;
4157   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4158   if (UseIncDec) {
4159     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4160   } else {
4161     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4162     shrptr(index, 1);
4163   }
4164 #ifndef _LP64
4165   // index may not have been a multiple of 8 (i.e., bit 2 was set)
4166   {
4167     Label even;
4168     // note: if index was a multiple of 8, then it cannot
4169     //       be 0 now otherwise it must have been 0 before
4170     //       => if it is even, we don't need to check for 0 again
4171     jcc(Assembler::carryClear, even);
4172     // clear topmost word (no jump would be needed if conditional assignment worked here)
4173     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4174     // index could be 0 now, must check again
4175     jcc(Assembler::zero, done);
4176     bind(even);
4177   }
4178 #endif // !_LP64
4179   // initialize remaining object fields: index is a multiple of 2 now
4180   {
4181     Label loop;
4182     bind(loop);
4183     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4184     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4185     decrement(index);
4186     jcc(Assembler::notZero, loop);
4187   }
4188 
4189   bind(done);
4190 }
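// Worked sketch (64-bit path, length assumed): length_in_bytes == 24 shifts
// down to index == 3, and the loop then stores the zero register at offsets
// offset_in_bytes + 16, + 8 and + 0, i.e. exactly the 24 requested bytes.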
4191 
4192 // Look up the method for a megamorphic invokeinterface call.
4193 // The target method is determined by <intf_klass, itable_index>.
4194 // The receiver klass is in recv_klass.
4195 // On success, the result will be in method_result, and execution falls through.
4196 // On failure, execution transfers to the given label.
4197 void MacroAssembler::lookup_interface_method(Register recv_klass,
4198                                              Register intf_klass,
4199                                              RegisterOrConstant itable_index,
4200                                              Register method_result,
4201                                              Register scan_temp,
4202                                              Label& L_no_such_interface,
4203                                              bool return_method) {
4204   assert_different_registers(recv_klass, intf_klass, scan_temp);
4205   assert_different_registers(method_result, intf_klass, scan_temp);
4206   assert(recv_klass != method_result || !return_method,
4207          "recv_klass can be destroyed when method isn't needed");
4208 
4209   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4210          "caller must use same register for non-constant itable index as for method");
4211 
4212   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4213   int vtable_base = in_bytes(Klass::vtable_start_offset());
4214   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4215   int scan_step   = itableOffsetEntry::size() * wordSize;
4216   int vte_size    = vtableEntry::size_in_bytes();
4217   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4218   assert(vte_size == wordSize, "else adjust times_vte_scale");
4219 
4220   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4221 
4222   // %%% Could store the aligned, prescaled offset in the klassoop.
4223   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4224 
4225   if (return_method) {
4226     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4227     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4228     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4229   }
4230 
4231   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4232   //   if (scan->interface() == intf) {
4233   //     result = (klass + scan->offset() + itable_index);
4234   //   }
4235   // }
4236   Label search, found_method;
4237 
4238   for (int peel = 1; peel >= 0; peel--) {
4239     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4240     cmpptr(intf_klass, method_result);
4241 
4242     if (peel) {
4243       jccb(Assembler::equal, found_method);
4244     } else {
4245       jccb(Assembler::notEqual, search);
4246       // (invert the test to fall through to found_method...)
4247     }
4248 
4249     if (!peel)  break;
4250 
4251     bind(search);
4252 
4253     // Check that the previous entry is non-null.  A null entry means that
4254     // the receiver class doesn't implement the interface, and wasn't the
4255     // same as when the caller was compiled.
4256     testptr(method_result, method_result);
4257     jcc(Assembler::zero, L_no_such_interface);
4258     addptr(scan_temp, scan_step);
4259   }
4260 
4261   bind(found_method);
4262 
4263   if (return_method) {
4264     // Got a hit.
4265     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4266     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4267   }
4268 }
4269 
4270 
4271 // virtual method calling
4272 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4273                                            RegisterOrConstant vtable_index,
4274                                            Register method_result) {
4275   const int base = in_bytes(Klass::vtable_start_offset());
4276   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4277   Address vtable_entry_addr(recv_klass,
4278                             vtable_index, Address::times_ptr,
4279                             base + vtableEntry::method_offset_in_bytes());
4280   movptr(method_result, vtable_entry_addr);
4281 }
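// Illustrative sketch (index assumed): with vtable_index == 5 the load above
// reads recv_klass + vtable_start_offset + 5 * wordSize + method_offset,
// i.e. the Method* slot of the sixth vtable entry.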
4282 
4283 
4284 void MacroAssembler::check_klass_subtype(Register sub_klass,
4285                            Register super_klass,
4286                            Register temp_reg,
4287                            Label& L_success) {
4288   Label L_failure;
4289   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4290   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4291   bind(L_failure);
4292 }
4293 
4294 
4295 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4296                                                    Register super_klass,
4297                                                    Register temp_reg,
4298                                                    Label* L_success,
4299                                                    Label* L_failure,
4300                                                    Label* L_slow_path,
4301                                         RegisterOrConstant super_check_offset) {
4302   assert_different_registers(sub_klass, super_klass, temp_reg);
4303   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4304   if (super_check_offset.is_register()) {
4305     assert_different_registers(sub_klass, super_klass,
4306                                super_check_offset.as_register());
4307   } else if (must_load_sco) {
4308     assert(temp_reg != noreg, "supply either a temp or a register offset");
4309   }
4310 
4311   Label L_fallthrough;
4312   int label_nulls = 0;
4313   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4314   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4315   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4316   assert(label_nulls <= 1, "at most one NULL in the batch");
4317 
4318   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4319   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4320   Address super_check_offset_addr(super_klass, sco_offset);
4321 
4322   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4323   // range of a jccb.  If this routine grows larger, reconsider at
4324   // least some of these.
4325 #define local_jcc(assembler_cond, label)                                \
4326   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4327   else                             jcc( assembler_cond, label) /*omit semi*/
4328 
4329   // Hacked jmp, which may only be used just before L_fallthrough.
4330 #define final_jmp(label)                                                \
4331   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4332   else                            jmp(label)                /*omit semi*/
4333 
4334   // If the pointers are equal, we are done (e.g., String[] elements).
4335   // This self-check enables sharing of secondary supertype arrays among
4336   // non-primary types such as array-of-interface.  Otherwise, each such
4337   // type would need its own customized SSA.
4338   // We move this check to the front of the fast path because many
4339   // type checks are in fact trivially successful in this manner,
4340   // so we get a nicely predicted branch right at the start of the check.
4341   cmpptr(sub_klass, super_klass);
4342   local_jcc(Assembler::equal, *L_success);
4343 
4344   // Check the supertype display:
4345   if (must_load_sco) {
4346     // A positive movl does the right thing on LP64.
4347     movl(temp_reg, super_check_offset_addr);
4348     super_check_offset = RegisterOrConstant(temp_reg);
4349   }
4350   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4351   cmpptr(super_klass, super_check_addr); // load displayed supertype
4352 
4353   // This check has worked decisively for primary supers.
4354   // Secondary supers are sought in the super_cache ('super_cache_addr').
4355   // (Secondary supers are interfaces and very deeply nested subtypes.)
4356   // This works in the same check above because of a tricky aliasing
4357   // between the super_cache and the primary super display elements.
4358   // (The 'super_check_addr' can address either, as the case requires.)
4359   // Note that the cache is updated below if it does not help us find
4360   // what we need immediately.
4361   // So if it was a primary super, we can just fail immediately.
4362   // Otherwise, it's the slow path for us (no success at this point).
4363 
4364   if (super_check_offset.is_register()) {
4365     local_jcc(Assembler::equal, *L_success);
4366     cmpl(super_check_offset.as_register(), sc_offset);
4367     if (L_failure == &L_fallthrough) {
4368       local_jcc(Assembler::equal, *L_slow_path);
4369     } else {
4370       local_jcc(Assembler::notEqual, *L_failure);
4371       final_jmp(*L_slow_path);
4372     }
4373   } else if (super_check_offset.as_constant() == sc_offset) {
4374     // Need a slow path; fast failure is impossible.
4375     if (L_slow_path == &L_fallthrough) {
4376       local_jcc(Assembler::equal, *L_success);
4377     } else {
4378       local_jcc(Assembler::notEqual, *L_slow_path);
4379       final_jmp(*L_success);
4380     }
4381   } else {
4382     // No slow path; it's a fast decision.
4383     if (L_failure == &L_fallthrough) {
4384       local_jcc(Assembler::equal, *L_success);
4385     } else {
4386       local_jcc(Assembler::notEqual, *L_failure);
4387       final_jmp(*L_success);
4388     }
4389   }
4390 
4391   bind(L_fallthrough);
4392 
4393 #undef local_jcc
4394 #undef final_jmp
4395 }
4396 
4397 
4398 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4399                                                    Register super_klass,
4400                                                    Register temp_reg,
4401                                                    Register temp2_reg,
4402                                                    Label* L_success,
4403                                                    Label* L_failure,
4404                                                    bool set_cond_codes) {
4405   assert_different_registers(sub_klass, super_klass, temp_reg);
4406   if (temp2_reg != noreg)
4407     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4408 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4409 
4410   Label L_fallthrough;
4411   int label_nulls = 0;
4412   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4413   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4414   assert(label_nulls <= 1, "at most one NULL in the batch");
4415 
4416   // a couple of useful fields in sub_klass:
4417   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4418   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4419   Address secondary_supers_addr(sub_klass, ss_offset);
4420   Address super_cache_addr(     sub_klass, sc_offset);
4421 
4422   // Do a linear scan of the secondary super-klass chain.
4423   // This code is rarely used, so simplicity is a virtue here.
4424   // The repne_scan instruction uses fixed registers, which we must spill.
4425   // Don't worry too much about pre-existing connections with the input regs.
4426 
4427   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4428   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4429 
4430   // Get super_klass value into rax (even if it was in rdi or rcx).
4431   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4432   if (super_klass != rax || UseCompressedOops) {
4433     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4434     mov(rax, super_klass);
4435   }
4436   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4437   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4438 
4439 #ifndef PRODUCT
4440   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4441   ExternalAddress pst_counter_addr((address) pst_counter);
4442   NOT_LP64(  incrementl(pst_counter_addr) );
4443   LP64_ONLY( lea(rcx, pst_counter_addr) );
4444   LP64_ONLY( incrementl(Address(rcx, 0)) );
4445 #endif //PRODUCT
4446 
4447   // We will consult the secondary-super array.
4448   movptr(rdi, secondary_supers_addr);
4449   // Load the array length.  (A positive movl does the right thing on LP64.)
4450   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4451   // Skip to start of data.
4452   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4453 
4454   // Scan RCX words at [RDI] for an occurrence of RAX.
4455   // Set NZ/Z based on last compare.
4456   // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself does
4457   // not change flags; only the repeated scas instruction sets them.
4458   // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
4459 
4460   testptr(rax, rax); // Set Z = 0
4461   repne_scan();
4462 
4463   // Unspill the temp. registers:
4464   if (pushed_rdi)  pop(rdi);
4465   if (pushed_rcx)  pop(rcx);
4466   if (pushed_rax)  pop(rax);
4467 
4468   if (set_cond_codes) {
4469     // Special hack for the AD files:  rdi is guaranteed non-zero.
4470     assert(!pushed_rdi, "rdi must be left non-NULL");
4471     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4472   }
4473 
4474   if (L_failure == &L_fallthrough)
4475         jccb(Assembler::notEqual, *L_failure);
4476   else  jcc(Assembler::notEqual, *L_failure);
4477 
4478   // Success.  Cache the super we found and proceed in triumph.
4479   movptr(super_cache_addr, super_klass);
4480 
4481   if (L_success != &L_fallthrough) {
4482     jmp(*L_success);
4483   }
4484 
4485 #undef IS_A_TEMP
4486 
4487   bind(L_fallthrough);
4488 }
4489 
4490 
4491 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
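  // Use a real conditional move when the CPU supports it; otherwise emulate it
  // with a short branch around the move.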
4492   if (VM_Version::supports_cmov()) {
4493     cmovl(cc, dst, src);
4494   } else {
4495     Label L;
4496     jccb(negate_condition(cc), L);
4497     movl(dst, src);
4498     bind(L);
4499   }
4500 }
4501 
4502 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4503   if (VM_Version::supports_cmov()) {
4504     cmovl(cc, dst, src);
4505   } else {
4506     Label L;
4507     jccb(negate_condition(cc), L);
4508     movl(dst, src);
4509     bind(L);
4510   }
4511 }
4512 
4513 void MacroAssembler::verify_oop(Register reg, const char* s) {
4514   if (!VerifyOops) return;
4515 
4516   // Pass register number to verify_oop_subroutine
4517   const char* b = NULL;
4518   {
4519     ResourceMark rm;
4520     stringStream ss;
4521     ss.print("verify_oop: %s: %s", reg->name(), s);
4522     b = code_string(ss.as_string());
4523   }
4524   BLOCK_COMMENT("verify_oop {");
4525 #ifdef _LP64
4526   push(rscratch1);                    // save r10, trashed by movptr()
4527 #endif
4528   push(rax);                          // save rax,
4529   push(reg);                          // pass register argument
4530   ExternalAddress buffer((address) b);
4531   // avoid using pushptr, as it modifies scratch registers
4532   // and our contract is not to modify anything
4533   movptr(rax, buffer.addr());
4534   push(rax);
4535   // call indirectly to solve generation ordering problem
4536   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4537   call(rax);
4538   // Caller pops the arguments (oop, message) and restores rax, r10
4539   BLOCK_COMMENT("} verify_oop");
4540 }
4541 
4542 
4543 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
4544                                                       Register tmp,
4545                                                       int offset) {
4546   intptr_t value = *delayed_value_addr;
4547   if (value != 0)
4548     return RegisterOrConstant(value + offset);
4549 
4550   // load indirectly to solve generation ordering problem
4551   movptr(tmp, ExternalAddress((address) delayed_value_addr));
4552 
4553 #ifdef ASSERT
4554   { Label L;
4555     testptr(tmp, tmp);
4556     if (WizardMode) {
4557       const char* buf = NULL;
4558       {
4559         ResourceMark rm;
4560         stringStream ss;
4561         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
4562         buf = code_string(ss.as_string());
4563       }
4564       jcc(Assembler::notZero, L);
4565       STOP(buf);
4566     } else {
4567       jccb(Assembler::notZero, L);
4568       hlt();
4569     }
4570     bind(L);
4571   }
4572 #endif
4573 
4574   if (offset != 0)
4575     addptr(tmp, offset);
4576 
4577   return RegisterOrConstant(tmp);
4578 }
4579 
4580 
4581 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4582                                          int extra_slot_offset) {
4583   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4584   int stackElementSize = Interpreter::stackElementSize;
4585   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4586 #ifdef ASSERT
4587   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4588   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4589 #endif
4590   Register             scale_reg    = noreg;
4591   Address::ScaleFactor scale_factor = Address::no_scale;
4592   if (arg_slot.is_constant()) {
4593     offset += arg_slot.as_constant() * stackElementSize;
4594   } else {
4595     scale_reg    = arg_slot.as_register();
4596     scale_factor = Address::times(stackElementSize);
4597   }
4598   offset += wordSize;           // return PC is on stack
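  // The returned address is rsp + offset, plus arg_slot * stackElementSize
  // (folded into 'offset' for constant slots, applied via scale_reg otherwise).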
4599   return Address(rsp, scale_reg, scale_factor, offset);
4600 }
4601 
4602 
4603 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
4604   if (!VerifyOops) return;
4605 
4606   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4607   // Pass register number to verify_oop_subroutine
4608   const char* b = NULL;
4609   {
4610     ResourceMark rm;
4611     stringStream ss;
4612     ss.print("verify_oop_addr: %s", s);
4613     b = code_string(ss.as_string());
4614   }
4615 #ifdef _LP64
4616   push(rscratch1);                    // save r10, trashed by movptr()
4617 #endif
4618   push(rax);                          // save rax,
4619   // addr may contain rsp so we will have to adjust it based on the push
4620   // we just did (and on 64 bit we do two pushes)
4621   // NOTE: the 64-bit code previously had a bug: it did movq(addr, rax), which
4622   // stores rax into addr, the reverse of what was intended.
4623   if (addr.uses(rsp)) {
4624     lea(rax, addr);
4625     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4626   } else {
4627     pushptr(addr);
4628   }
4629 
4630   ExternalAddress buffer((address) b);
4631   // pass msg argument
4632   // avoid using pushptr, as it modifies scratch registers
4633   // and our contract is not to modify anything
4634   movptr(rax, buffer.addr());
4635   push(rax);
4636 
4637   // call indirectly to solve generation ordering problem
4638   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4639   call(rax);
4640   // Caller pops the arguments (addr, message) and restores rax, r10.
4641 }
4642 
4643 void MacroAssembler::verify_tlab() {
4644 #ifdef ASSERT
4645   if (UseTLAB && VerifyOops) {
4646     Label next, ok;
4647     Register t1 = rsi;
4648     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4649 
4650     push(t1);
4651     NOT_LP64(push(thread_reg));
4652     NOT_LP64(get_thread(thread_reg));
4653 
4654     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4655     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4656     jcc(Assembler::aboveEqual, next);
4657     STOP("assert(top >= start)");
4658     should_not_reach_here();
4659 
4660     bind(next);
4661     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4662     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4663     jcc(Assembler::aboveEqual, ok);
4664     STOP("assert(top <= end)");
4665     should_not_reach_here();
4666 
4667     bind(ok);
4668     NOT_LP64(pop(thread_reg));
4669     pop(t1);
4670   }
4671 #endif
4672 }
4673 
4674 class ControlWord {
4675  public:
4676   int32_t _value;
4677 
4678   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4679   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4680   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4681   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4682   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4683   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4684   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4685   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4686 
4687   void print() const {
4688     // rounding control
4689     const char* rc;
4690     switch (rounding_control()) {
4691       case 0: rc = "round near"; break;
4692       case 1: rc = "round down"; break;
4693       case 2: rc = "round up  "; break;
4694       case 3: rc = "chop      "; break;
4695     }
4696     // precision control
4697     const char* pc;
4698     switch (precision_control()) {
4699       case 0: pc = "24 bits "; break;
4700       case 1: pc = "reserved"; break;
4701       case 2: pc = "53 bits "; break;
4702       case 3: pc = "64 bits "; break;
4703     }
4704     // flags
4705     char f[9];
4706     f[0] = ' ';
4707     f[1] = ' ';
4708     f[2] = (precision   ()) ? 'P' : 'p';
4709     f[3] = (underflow   ()) ? 'U' : 'u';
4710     f[4] = (overflow    ()) ? 'O' : 'o';
4711     f[5] = (zero_divide ()) ? 'Z' : 'z';
4712     f[6] = (denormalized()) ? 'D' : 'd';
4713     f[7] = (invalid     ()) ? 'I' : 'i';
4714     f[8] = '\x0';
4715     // output
4716     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4717   }
4718 
4719 };
4720 
4721 class StatusWord {
4722  public:
4723   int32_t _value;
4724 
4725   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4726   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4727   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4728   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4729   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4730   int  top() const                     { return  (_value >> 11) & 7      ; }
4731   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4732   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4733   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4734   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4735   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4736   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4737   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4738   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4739 
4740   void print() const {
4741     // condition codes
4742     char c[5];
4743     c[0] = (C3()) ? '3' : '-';
4744     c[1] = (C2()) ? '2' : '-';
4745     c[2] = (C1()) ? '1' : '-';
4746     c[3] = (C0()) ? '0' : '-';
4747     c[4] = '\x0';
4748     // flags
4749     char f[9];
4750     f[0] = (error_status()) ? 'E' : '-';
4751     f[1] = (stack_fault ()) ? 'S' : '-';
4752     f[2] = (precision   ()) ? 'P' : '-';
4753     f[3] = (underflow   ()) ? 'U' : '-';
4754     f[4] = (overflow    ()) ? 'O' : '-';
4755     f[5] = (zero_divide ()) ? 'Z' : '-';
4756     f[6] = (denormalized()) ? 'D' : '-';
4757     f[7] = (invalid     ()) ? 'I' : '-';
4758     f[8] = '\x0';
4759     // output
4760     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4761   }
4762 
4763 };
4764 
4765 class TagWord {
4766  public:
4767   int32_t _value;
4768 
4769   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4770 
4771   void print() const {
4772     printf("%04x", _value & 0xFFFF);
4773   }
4774 
4775 };
4776 
4777 class FPU_Register {
4778  public:
4779   int32_t _m0;
4780   int32_t _m1;
4781   int16_t _ex;
4782 
4783   bool is_indefinite() const           {
4784     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4785   }
4786 
4787   void print() const {
4788     char  sign = (_ex < 0) ? '-' : '+';
4789     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4790     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4791   }
4792 
4793 };
4794 
4795 class FPU_State {
4796  public:
4797   enum {
4798     register_size       = 10,
4799     number_of_registers =  8,
4800     register_mask       =  7
4801   };
4802 
4803   ControlWord  _control_word;
4804   StatusWord   _status_word;
4805   TagWord      _tag_word;
4806   int32_t      _error_offset;
4807   int32_t      _error_selector;
4808   int32_t      _data_offset;
4809   int32_t      _data_selector;
4810   int8_t       _register[register_size * number_of_registers];
4811 
4812   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4813   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4814 
4815   const char* tag_as_string(int tag) const {
4816     switch (tag) {
4817       case 0: return "valid";
4818       case 1: return "zero";
4819       case 2: return "special";
4820       case 3: return "empty";
4821     }
4822     ShouldNotReachHere();
4823     return NULL;
4824   }
4825 
4826   void print() const {
4827     // print computation registers
4828     { int t = _status_word.top();
4829       for (int i = 0; i < number_of_registers; i++) {
4830         int j = (i - t) & register_mask;
4831         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4832         st(j)->print();
4833         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4834       }
4835     }
4836     printf("\n");
4837     // print control registers
4838     printf("ctrl = "); _control_word.print(); printf("\n");
4839     printf("stat = "); _status_word .print(); printf("\n");
4840     printf("tags = "); _tag_word    .print(); printf("\n");
4841   }
4842 
4843 };
4844 
4845 class Flag_Register {
4846  public:
4847   int32_t _value;
4848 
4849   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4850   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4851   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4852   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4853   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4854   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4855   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4856 
4857   void print() const {
4858     // flags
4859     char f[8];
4860     f[0] = (overflow       ()) ? 'O' : '-';
4861     f[1] = (direction      ()) ? 'D' : '-';
4862     f[2] = (sign           ()) ? 'S' : '-';
4863     f[3] = (zero           ()) ? 'Z' : '-';
4864     f[4] = (auxiliary_carry()) ? 'A' : '-';
4865     f[5] = (parity         ()) ? 'P' : '-';
4866     f[6] = (carry          ()) ? 'C' : '-';
4867     f[7] = '\x0';
4868     // output
4869     printf("%08x  flags = %s", _value, f);
4870   }
4871 
4872 };
4873 
4874 class IU_Register {
4875  public:
4876   int32_t _value;
4877 
4878   void print() const {
4879     printf("%08x  %11d", _value, _value);
4880   }
4881 
4882 };
4883 
4884 class IU_State {
4885  public:
4886   Flag_Register _eflags;
4887   IU_Register   _rdi;
4888   IU_Register   _rsi;
4889   IU_Register   _rbp;
4890   IU_Register   _rsp;
4891   IU_Register   _rbx;
4892   IU_Register   _rdx;
4893   IU_Register   _rcx;
4894   IU_Register   _rax;
4895 
4896   void print() const {
4897     // computation registers
4898     printf("rax,  = "); _rax.print(); printf("\n");
4899     printf("rbx,  = "); _rbx.print(); printf("\n");
4900     printf("rcx  = "); _rcx.print(); printf("\n");
4901     printf("rdx  = "); _rdx.print(); printf("\n");
4902     printf("rdi  = "); _rdi.print(); printf("\n");
4903     printf("rsi  = "); _rsi.print(); printf("\n");
4904     printf("rbp,  = "); _rbp.print(); printf("\n");
4905     printf("rsp  = "); _rsp.print(); printf("\n");
4906     printf("\n");
4907     // control registers
4908     printf("flgs = "); _eflags.print(); printf("\n");
4909   }
4910 };
4911 
4912 
4913 class CPU_State {
4914  public:
4915   FPU_State _fpu_state;
4916   IU_State  _iu_state;
4917 
4918   void print() const {
4919     printf("--------------------------------------------------\n");
4920     _iu_state .print();
4921     printf("\n");
4922     _fpu_state.print();
4923     printf("--------------------------------------------------\n");
4924   }
4925 
4926 };
4927 
4928 
4929 static void _print_CPU_state(CPU_State* state) {
4930   state->print();
4931 }
4932 
4933 
4934 void MacroAssembler::print_CPU_state() {
4935   push_CPU_state();
4936   push(rsp);                // pass CPU state
4937   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4938   addptr(rsp, wordSize);       // discard argument
4939   pop_CPU_state();
4940 }
4941 
4942 
4943 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4944   static int counter = 0;
4945   FPU_State* fs = &state->_fpu_state;
4946   counter++;
4947   // For leaf calls, only verify that the top few elements remain empty.
4948   // We only need 1 empty at the top for C2 code.
4949   if( stack_depth < 0 ) {
4950     if( fs->tag_for_st(7) != 3 ) {
4951       printf("FPR7 not empty\n");
4952       state->print();
4953       assert(false, "error");
4954       return false;
4955     }
4956     return true;                // All other stack states do not matter
4957   }
4958 
4959   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
4960          "bad FPU control word");
4961 
4962   // compute stack depth
4963   int i = 0;
4964   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4965   int d = i;
4966   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4967   // verify findings
4968   if (i != FPU_State::number_of_registers) {
4969     // stack not contiguous
4970     printf("%s: stack not contiguous at ST%d\n", s, i);
4971     state->print();
4972     assert(false, "error");
4973     return false;
4974   }
4975   // check if computed stack depth corresponds to expected stack depth
4976   if (stack_depth < 0) {
4977     // expected stack depth is -stack_depth or less
4978     if (d > -stack_depth) {
4979       // too many elements on the stack
4980       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4981       state->print();
4982       assert(false, "error");
4983       return false;
4984     }
4985   } else {
4986     // expected stack depth is stack_depth
4987     if (d != stack_depth) {
4988       // wrong stack depth
4989       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4990       state->print();
4991       assert(false, "error");
4992       return false;
4993     }
4994   }
4995   // everything is cool
4996   return true;
4997 }
4998 
4999 
5000 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5001   if (!VerifyFPU) return;
5002   push_CPU_state();
5003   push(rsp);                // pass CPU state
5004   ExternalAddress msg((address) s);
5005   // pass message string s
5006   pushptr(msg.addr());
5007   push(stack_depth);        // pass stack depth
5008   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5009   addptr(rsp, 3 * wordSize);   // discard arguments
5010   // check for error
5011   { Label L;
5012     testl(rax, rax);
5013     jcc(Assembler::notZero, L);
5014     int3();                  // break if error condition
5015     bind(L);
5016   }
5017   pop_CPU_state();
5018 }
5019 
5020 void MacroAssembler::restore_cpu_control_state_after_jni() {
5021   // Either restore the MXCSR register after returning from the JNI Call
5022   // or verify that it wasn't changed (with -Xcheck:jni flag).
5023   if (VM_Version::supports_sse()) {
5024     if (RestoreMXCSROnJNICalls) {
5025       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5026     } else if (CheckJNICalls) {
5027       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5028     }
5029   }
5030   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5031   vzeroupper();
5032   // Reset k1 to 0xffff.
5033 
5034 #ifdef COMPILER2
5035   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
5036     push(rcx);
5037     movl(rcx, 0xffff);
5038     kmovwl(k1, rcx);
5039     pop(rcx);
5040   }
5041 #endif // COMPILER2
5042 
5043 #ifndef _LP64
5044   // Either restore the x87 floating point control word after returning
5045   // from the JNI call or verify that it wasn't changed.
5046   if (CheckJNICalls) {
5047     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5048   }
5049 #endif // _LP64
5050 }
5051 
5052 // ((OopHandle)result).resolve();
5053 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5054   assert_different_registers(result, tmp);
5055 
5056   // Only 64 bit platforms support GCs that require a tmp register
5057   // Only IN_HEAP loads require a thread_tmp register
5058   // OopHandle::resolve is an indirection like jobject.
5059   access_load_at(T_OBJECT, IN_NATIVE,
5060                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5061 }
5062 
5063 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5064   // get mirror
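  // Pointer chain: method -> ConstMethod -> ConstantPool -> pool_holder (the
  // declaring Klass) -> java_mirror, which is an OopHandle and is resolved below.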
5065   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5066   movptr(mirror, Address(method, Method::const_offset()));
5067   movptr(mirror, Address(mirror, ConstMethod::constants_offset()));
5068   movptr(mirror, Address(mirror, ConstantPool::pool_holder_offset_in_bytes()));
5069   movptr(mirror, Address(mirror, mirror_offset));
5070   resolve_oop_handle(mirror, tmp);
5071 }
5072 
5073 void MacroAssembler::load_klass(Register dst, Register src) {
5074 #ifdef _LP64
5075   if (UseCompressedClassPointers) {
5076     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5077     decode_klass_not_null(dst);
5078   } else
5079 #endif
5080     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5081 }
5082 
5083 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5084   load_klass(dst, src);
5085   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5086 }
5087 
5088 void MacroAssembler::store_klass(Register dst, Register src) {
5089 #ifdef _LP64
5090   if (UseCompressedClassPointers) {
5091     encode_klass_not_null(src);
5092     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5093   } else
5094 #endif
5095     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5096 }
5097 
5098 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5099                                     Register tmp1, Register thread_tmp) {
5100   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5101   decorators = AccessInternal::decorator_fixup(decorators);
5102   bool as_raw = (decorators & AS_RAW) != 0;
5103   if (as_raw) {
5104     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5105   } else {
5106     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5107   }
5108 }
5109 
5110 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5111                                      Register tmp1, Register tmp2) {
5112   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5113   decorators = AccessInternal::decorator_fixup(decorators);
5114   bool as_raw = (decorators & AS_RAW) != 0;
5115   if (as_raw) {
5116     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
5117   } else {
5118     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
5119   }
5120 }
5121 
5122 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5123   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5124   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5125     decorators |= ACCESS_READ | ACCESS_WRITE;
5126   }
5127   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5128   return bs->resolve(this, decorators, obj);
5129 }
5130 
5131 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5132                                    Register thread_tmp, DecoratorSet decorators) {
5133   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5134 }
5135 
5136 // Doesn't do verification; generates fixed-size code
5137 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5138                                             Register thread_tmp, DecoratorSet decorators) {
5139   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5140 }
5141 
5142 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
5143                                     Register tmp2, DecoratorSet decorators) {
5144   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
5145 }
5146 
5147 // Used for storing NULLs.
5148 void MacroAssembler::store_heap_oop_null(Address dst) {
5149   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
5150 }
5151 
5152 #ifdef _LP64
5153 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5154   if (UseCompressedClassPointers) {
5155     // Store to klass gap in destination
5156     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5157   }
5158 }
5159 
5160 #ifdef ASSERT
5161 void MacroAssembler::verify_heapbase(const char* msg) {
5162   assert (UseCompressedOops, "should be compressed");
5163   assert (Universe::heap() != NULL, "java heap should be initialized");
5164   if (CheckCompressedOops) {
5165     Label ok;
5166     push(rscratch1); // cmpptr trashes rscratch1
5167     cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
5168     jcc(Assembler::equal, ok);
5169     STOP(msg);
5170     bind(ok);
5171     pop(rscratch1);
5172   }
5173 }
5174 #endif
5175 
5176 // Algorithm must match oop.inline.hpp encode_heap_oop.
5177 void MacroAssembler::encode_heap_oop(Register r) {
5178 #ifdef ASSERT
5179   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5180 #endif
5181   verify_oop(r, "broken oop in encode_heap_oop");
5182   if (Universe::narrow_oop_base() == NULL) {
5183     if (Universe::narrow_oop_shift() != 0) {
5184       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5185       shrq(r, LogMinObjAlignmentInBytes);
5186     }
5187     return;
5188   }
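  // Heap-based encoding: a NULL oop must encode to 0, so substitute the heap
  // base for NULL before subtracting it back out, then shift.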
5189   testq(r, r);
5190   cmovq(Assembler::equal, r, r12_heapbase);
5191   subq(r, r12_heapbase);
5192   shrq(r, LogMinObjAlignmentInBytes);
5193 }
5194 
5195 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5196 #ifdef ASSERT
5197   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5198   if (CheckCompressedOops) {
5199     Label ok;
5200     testq(r, r);
5201     jcc(Assembler::notEqual, ok);
5202     STOP("null oop passed to encode_heap_oop_not_null");
5203     bind(ok);
5204   }
5205 #endif
5206   verify_oop(r, "broken oop in encode_heap_oop_not_null");
5207   if (Universe::narrow_oop_base() != NULL) {
5208     subq(r, r12_heapbase);
5209   }
5210   if (Universe::narrow_oop_shift() != 0) {
5211     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5212     shrq(r, LogMinObjAlignmentInBytes);
5213   }
5214 }
5215 
5216 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5217 #ifdef ASSERT
5218   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5219   if (CheckCompressedOops) {
5220     Label ok;
5221     testq(src, src);
5222     jcc(Assembler::notEqual, ok);
5223     STOP("null oop passed to encode_heap_oop_not_null2");
5224     bind(ok);
5225   }
5226 #endif
5227   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5228   if (dst != src) {
5229     movq(dst, src);
5230   }
5231   if (Universe::narrow_oop_base() != NULL) {
5232     subq(dst, r12_heapbase);
5233   }
5234   if (Universe::narrow_oop_shift() != 0) {
5235     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5236     shrq(dst, LogMinObjAlignmentInBytes);
5237   }
5238 }
5239 
5240 void  MacroAssembler::decode_heap_oop(Register r) {
5241 #ifdef ASSERT
5242   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5243 #endif
5244   if (Universe::narrow_oop_base() == NULL) {
5245     if (Universe::narrow_oop_shift() != 0) {
5246       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5247       shlq(r, LogMinObjAlignmentInBytes);
5248     }
5249   } else {
5250     Label done;
5251     shlq(r, LogMinObjAlignmentInBytes);
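    // shlq leaves ZF set iff the narrow oop was NULL, so a NULL oop skips the
    // base addition and decodes back to NULL.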
5252     jccb(Assembler::equal, done);
5253     addq(r, r12_heapbase);
5254     bind(done);
5255   }
5256   verify_oop(r, "broken oop in decode_heap_oop");
5257 }
5258 
5259 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5260   // Note: it will change flags
5261   assert (UseCompressedOops, "should only be used for compressed headers");
5262   assert (Universe::heap() != NULL, "java heap should be initialized");
5263   // Cannot assert, unverified entry point counts instructions (see .ad file)
5264   // vtableStubs also counts instructions in pd_code_size_limit.
5265   // Also do not verify_oop as this is called by verify_oop.
5266   if (Universe::narrow_oop_shift() != 0) {
5267     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5268     shlq(r, LogMinObjAlignmentInBytes);
5269     if (Universe::narrow_oop_base() != NULL) {
5270       addq(r, r12_heapbase);
5271     }
5272   } else {
5273     assert (Universe::narrow_oop_base() == NULL, "sanity");
5274   }
5275 }
5276 
5277 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5278   // Note: it will change flags
5279   assert (UseCompressedOops, "should only be used for compressed headers");
5280   assert (Universe::heap() != NULL, "java heap should be initialized");
5281   // Cannot assert, unverified entry point counts instructions (see .ad file)
5282   // vtableStubs also counts instructions in pd_code_size_limit.
5283   // Also do not verify_oop as this is called by verify_oop.
5284   if (Universe::narrow_oop_shift() != 0) {
5285     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
5286     if (LogMinObjAlignmentInBytes == Address::times_8) {
5287       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5288     } else {
5289       if (dst != src) {
5290         movq(dst, src);
5291       }
5292       shlq(dst, LogMinObjAlignmentInBytes);
5293       if (Universe::narrow_oop_base() != NULL) {
5294         addq(dst, r12_heapbase);
5295       }
5296     }
5297   } else {
5298     assert (Universe::narrow_oop_base() == NULL, "sanity");
5299     if (dst != src) {
5300       movq(dst, src);
5301     }
5302   }
5303 }
5304 
5305 void MacroAssembler::encode_klass_not_null(Register r) {
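  // narrow_klass = (klass - narrow_klass_base) >> narrow_klass_shift.
  // r12 is borrowed as a scratch for the base and restored via reinit_heapbase().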
5306   if (Universe::narrow_klass_base() != NULL) {
5307     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5308     assert(r != r12_heapbase, "Encoding a klass in r12");
5309     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5310     subq(r, r12_heapbase);
5311   }
5312   if (Universe::narrow_klass_shift() != 0) {
5313     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5314     shrq(r, LogKlassAlignmentInBytes);
5315   }
5316   if (Universe::narrow_klass_base() != NULL) {
5317     reinit_heapbase();
5318   }
5319 }
5320 
5321 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5322   if (dst == src) {
5323     encode_klass_not_null(src);
5324   } else {
5325     if (Universe::narrow_klass_base() != NULL) {
5326       mov64(dst, (int64_t)Universe::narrow_klass_base());
5327       negq(dst);
5328       addq(dst, src);
5329     } else {
5330       movptr(dst, src);
5331     }
5332     if (Universe::narrow_klass_shift() != 0) {
5333       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5334       shrq(dst, LogKlassAlignmentInBytes);
5335     }
5336   }
5337 }
5338 
5339 // Function instr_size_for_decode_klass_not_null() counts the instructions
5340 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
5341 // when (Universe::heap() != NULL).  Hence, if the instructions they
5342 // generate change, then this method needs to be updated.
5343 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5344   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5345   if (Universe::narrow_klass_base() != NULL) {
5346     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5347     return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
5348   } else {
5349     // longest load decode klass function, mov64, leaq
5350     return 16;
5351   }
5352 }
5353 
5354 // !!! If the instructions that get generated here change then function
5355 // instr_size_for_decode_klass_not_null() needs to get updated.
5356 void  MacroAssembler::decode_klass_not_null(Register r) {
5357   // Note: it will change flags
5358   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5359   assert(r != r12_heapbase, "Decoding a klass in r12");
5360   // Cannot assert, unverified entry point counts instructions (see .ad file)
5361   // vtableStubs also counts instructions in pd_code_size_limit.
5362   // Also do not verify_oop as this is called by verify_oop.
5363   if (Universe::narrow_klass_shift() != 0) {
5364     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5365     shlq(r, LogKlassAlignmentInBytes);
5366   }
5367   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5368   if (Universe::narrow_klass_base() != NULL) {
5369     mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
5370     addq(r, r12_heapbase);
5371     reinit_heapbase();
5372   }
5373 }
5374 
5375 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5376   // Note: it will change flags
5377   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5378   if (dst == src) {
5379     decode_klass_not_null(dst);
5380   } else {
5381     // Cannot assert, unverified entry point counts instructions (see .ad file)
5382     // vtableStubs also counts instructions in pd_code_size_limit.
5383     // Also do not verify_oop as this is called by verify_oop.
5384     mov64(dst, (int64_t)Universe::narrow_klass_base());
5385     if (Universe::narrow_klass_shift() != 0) {
5386       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
5387       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5388       leaq(dst, Address(dst, src, Address::times_8, 0));
5389     } else {
5390       addq(dst, src);
5391     }
5392   }
5393 }
5394 
5395 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5396   assert (UseCompressedOops, "should only be used for compressed headers");
5397   assert (Universe::heap() != NULL, "java heap should be initialized");
5398   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5399   int oop_index = oop_recorder()->find_index(obj);
5400   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5401   mov_narrow_oop(dst, oop_index, rspec);
5402 }
5403 
5404 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5405   assert (UseCompressedOops, "should only be used for compressed headers");
5406   assert (Universe::heap() != NULL, "java heap should be initialized");
5407   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5408   int oop_index = oop_recorder()->find_index(obj);
5409   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5410   mov_narrow_oop(dst, oop_index, rspec);
5411 }
5412 
5413 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5414   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5415   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5416   int klass_index = oop_recorder()->find_index(k);
5417   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5418   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
5419 }
5420 
5421 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5422   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5423   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5424   int klass_index = oop_recorder()->find_index(k);
5425   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5426   mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
5427 }
5428 
5429 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5430   assert (UseCompressedOops, "should only be used for compressed headers");
5431   assert (Universe::heap() != NULL, "java heap should be initialized");
5432   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5433   int oop_index = oop_recorder()->find_index(obj);
5434   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5435   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5436 }
5437 
5438 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5439   assert (UseCompressedOops, "should only be used for compressed headers");
5440   assert (Universe::heap() != NULL, "java heap should be initialized");
5441   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5442   int oop_index = oop_recorder()->find_index(obj);
5443   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5444   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5445 }
5446 
5447 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5448   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5449   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5450   int klass_index = oop_recorder()->find_index(k);
5451   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5452   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
5453 }
5454 
5455 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5456   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5457   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5458   int klass_index = oop_recorder()->find_index(k);
5459   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5460   Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
5461 }
5462 
5463 void MacroAssembler::reinit_heapbase() {
5464   if (UseCompressedOops || UseCompressedClassPointers) {
5465     if (Universe::heap() != NULL) {
5466       if (Universe::narrow_oop_base() == NULL) {
5467         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5468       } else {
5469         mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
5470       }
5471     } else {
5472       movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
5473     }
5474   }
5475 }
5476 
5477 #endif // _LP64
5478 
5479 // C2 compiled method's prolog code.
5480 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
5481 
5482   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5483   // NativeJump::patch_verified_entry will be able to patch out the entry
5484   // code safely. The push to verify stack depth is ok at 5 bytes,
5485   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5486   // stack bang then we must use the 6 byte frame allocation even if
5487   // we have no frame. :-(
5488   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5489 
5490   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5491   // Remove word for return addr
5492   framesize -= wordSize;
5493   stack_bang_size -= wordSize;
5494 
5495   // Calls to C2R adapters often do not accept exceptional returns.
5496   // We require that their callers bang the stack for them.  Be careful, because
5497   // some VM calls (such as call site linkage) can use several kilobytes of
5498   // stack, but the stack safety zone should account for that.
5499   // See bugs 4446381, 4468289, 4497237.
5500   if (stack_bang_size > 0) {
5501     generate_stack_overflow_check(stack_bang_size);
5502 
5503     // We always push rbp, so that on return to interpreter rbp, will be
5504     // restored correctly and we can correct the stack.
5505     push(rbp);
5506     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5507     if (PreserveFramePointer) {
5508       mov(rbp, rsp);
5509     }
5510     // Remove word for ebp
5511     framesize -= wordSize;
5512 
5513     // Create frame
5514     if (framesize) {
5515       subptr(rsp, framesize);
5516     }
5517   } else {
5518     // Create frame (force generation of a 4 byte immediate value)
5519     subptr_imm32(rsp, framesize);
5520 
5521     // Save RBP register now.
5522     framesize -= wordSize;
5523     movptr(Address(rsp, framesize), rbp);
5524     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5525     if (PreserveFramePointer) {
5526       movptr(rbp, rsp);
5527       if (framesize > 0) {
5528         addptr(rbp, framesize);
5529       }
5530     }
5531   }
5532 
5533   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5534     framesize -= wordSize;
5535     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5536   }
5537 
5538 #ifndef _LP64
5539   // If method sets FPU control word do it now
5540   if (fp_mode_24b) {
5541     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
5542   }
5543   if (UseSSE >= 2 && VerifyFPU) {
5544     verify_FPU(0, "FPU stack must be clean on entry");
5545   }
5546 #endif
5547 
5548 #ifdef ASSERT
5549   if (VerifyStackAtCalls) {
5550     Label L;
5551     push(rax);
5552     mov(rax, rsp);
5553     andptr(rax, StackAlignmentInBytes-1);
5554     cmpptr(rax, StackAlignmentInBytes-wordSize);
5555     pop(rax);
5556     jcc(Assembler::equal, L);
5557     STOP("Stack is not properly aligned!");
5558     bind(L);
5559   }
5560 #endif
5561 
5562 }
5563 
5564 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
5565 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
5566   // cnt - number of qwords (8-byte words).
5567   // base - start address, qword aligned.
5568   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
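  // The main loop clears 64 bytes per iteration; the tail clears a final
  // 32-byte chunk if needed and then individual 8-byte words.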
5569   if (UseAVX >= 2) {
5570     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
5571   } else {
5572     pxor(xtmp, xtmp);
5573   }
5574   jmp(L_zero_64_bytes);
5575 
5576   BIND(L_loop);
5577   if (UseAVX >= 2) {
5578     vmovdqu(Address(base,  0), xtmp);
5579     vmovdqu(Address(base, 32), xtmp);
5580   } else {
5581     movdqu(Address(base,  0), xtmp);
5582     movdqu(Address(base, 16), xtmp);
5583     movdqu(Address(base, 32), xtmp);
5584     movdqu(Address(base, 48), xtmp);
5585   }
5586   addptr(base, 64);
5587 
5588   BIND(L_zero_64_bytes);
5589   subptr(cnt, 8);
5590   jccb(Assembler::greaterEqual, L_loop);
5591   addptr(cnt, 4);
5592   jccb(Assembler::less, L_tail);
5593   // Copy trailing 32 bytes
5594   if (UseAVX >= 2) {
5595     vmovdqu(Address(base, 0), xtmp);
5596   } else {
5597     movdqu(Address(base,  0), xtmp);
5598     movdqu(Address(base, 16), xtmp);
5599   }
5600   addptr(base, 32);
5601   subptr(cnt, 4);
5602 
5603   BIND(L_tail);
5604   addptr(cnt, 4);
5605   jccb(Assembler::lessEqual, L_end);
5606   decrement(cnt);
5607 
5608   BIND(L_sloop);
5609   movq(Address(base, 0), xtmp);
5610   addptr(base, 8);
5611   decrement(cnt);
5612   jccb(Assembler::greaterEqual, L_sloop);
5613   BIND(L_end);
5614 }
5615 
5616 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
5617   // cnt - number of qwords (8-byte words).
5618   // base - start address, qword aligned.
5619   // is_large - true if the optimizer knows that cnt is larger than InitArrayShortSize
5620   assert(base==rdi, "base register must be edi for rep stos");
5621   assert(tmp==rax,   "tmp register must be eax for rep stos");
5622   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5623   assert(InitArrayShortSize % BytesPerLong == 0,
5624     "InitArrayShortSize should be the multiple of BytesPerLong");
5625 
5626   Label DONE;
5627 
5628   if (!is_large || !UseXMMForObjInit) {
5629     xorptr(tmp, tmp);
5630   }
5631 
5632   if (!is_large) {
5633     Label LOOP, LONG;
5634     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5635     jccb(Assembler::greater, LONG);
5636 
5637     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5638 
5639     decrement(cnt);
5640     jccb(Assembler::negative, DONE); // Zero length
5641 
5642     // Use individual pointer-sized stores for small counts:
5643     BIND(LOOP);
5644     movptr(Address(base, cnt, Address::times_ptr), tmp);
5645     decrement(cnt);
5646     jccb(Assembler::greaterEqual, LOOP);
5647     jmpb(DONE);
5648 
5649     BIND(LONG);
5650   }
5651 
5652   // Use longer rep-prefixed ops for non-small counts:
5653   if (UseFastStosb) {
5654     shlptr(cnt, 3); // convert to number of bytes
5655     rep_stosb();
5656   } else if (UseXMMForObjInit) {
5657     movptr(tmp, base);
5658     xmm_clear_mem(tmp, cnt, xtmp);
5659   } else {
5660     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5661     rep_stos();
5662   }
5663 
5664   BIND(DONE);
5665 }
5666 
5667 #ifdef COMPILER2
5668 
5669 // IndexOf for constant substrings with size >= 8 chars
5670 // which don't need to be loaded through the stack.
5671 void MacroAssembler::string_indexofC8(Register str1, Register str2,
5672                                       Register cnt1, Register cnt2,
5673                                       int int_cnt2,  Register result,
5674                                       XMMRegister vec, Register tmp,
5675                                       int ae) {
5676   ShortBranchVerifier sbv(this);
5677   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
5678   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
5679 
5680   // This method uses the pcmpestri instruction with bound registers
5681   //   inputs:
5682   //     xmm - substring
5683   //     rax - substring length (elements count)
5684   //     mem - scanned string
5685   //     rdx - string length (elements count)
5686   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
5687   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
5688   //   outputs:
5689   //     rcx - matched index in string
5690   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
5691   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
5692   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
5693   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
5694   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
5695 
5696   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
5697         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
5698         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
5699 
5700   // Note, inline_string_indexOf() generates checks:
5701   // if (substr.count > string.count) return -1;
5702   // if (substr.count == 0) return 0;
5703   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
5704 
5705   // Load substring.
5706   if (ae == StrIntrinsicNode::UL) {
5707     pmovzxbw(vec, Address(str2, 0));
5708   } else {
5709     movdqu(vec, Address(str2, 0));
5710   }
5711   movl(cnt2, int_cnt2);
5712   movptr(result, str1); // string addr
5713 
5714   if (int_cnt2 > stride) {
5715     jmpb(SCAN_TO_SUBSTR);
5716 
5717     // Reload substr for rescan; this code
5718     // is executed only for large substrings (> 8 chars).
5719     bind(RELOAD_SUBSTR);
5720     if (ae == StrIntrinsicNode::UL) {
5721       pmovzxbw(vec, Address(str2, 0));
5722     } else {
5723       movdqu(vec, Address(str2, 0));
5724     }
5725     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
5726 
5727     bind(RELOAD_STR);
5728     // We came here after the beginning of the substring was
5729     // matched but the rest of it was not so we need to search
5730     // again. Start from the next element after the previous match.
5731 
5732     // cnt2 is the number of remaining substring elements and
5733     // cnt1 is the number of remaining string elements when the compare failed.
5734     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
5735     subl(cnt1, cnt2);
5736     addl(cnt1, int_cnt2);
5737     movl(cnt2, int_cnt2); // Now restore cnt2
5738 
5739     decrementl(cnt1);     // Shift to next element
5740     cmpl(cnt1, cnt2);
5741     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
5742 
5743     addptr(result, (1<<scale1));
5744 
5745   } // (int_cnt2 > 8)
5746 
5747   // Scan string for start of substr in 16-byte vectors
5748   bind(SCAN_TO_SUBSTR);
5749   pcmpestri(vec, Address(result, 0), mode);
5750   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
5751   subl(cnt1, stride);
5752   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
5753   cmpl(cnt1, cnt2);
5754   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer elements left than the substring
5755   addptr(result, 16);
5756   jmpb(SCAN_TO_SUBSTR);
5757 
5758   // Found a potential substr
5759   bind(FOUND_CANDIDATE);
5760   // Matched whole vector if first element matched (tmp(rcx) == 0).
5761   if (int_cnt2 == stride) {
5762     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
5763   } else { // int_cnt2 > 8
5764     jccb(Assembler::overflow, FOUND_SUBSTR);
5765   }
5766   // After pcmpestri tmp(rcx) contains matched element index
5767   // Compute start addr of substr
5768   lea(result, Address(result, tmp, scale1));
5769 
5770   // Make sure string is still long enough
5771   subl(cnt1, tmp);
5772   cmpl(cnt1, cnt2);
5773   if (int_cnt2 == stride) {
5774     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
5775   } else { // int_cnt2 > 8
5776     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
5777   }
5778   // Left less than substring.
5779 
5780   bind(RET_NOT_FOUND);
5781   movl(result, -1);
5782   jmp(EXIT);
5783 
5784   if (int_cnt2 > stride) {
5785     // This code is optimized for the case when whole substring
5786     // is matched if its head is matched.
5787     bind(MATCH_SUBSTR_HEAD);
5788     pcmpestri(vec, Address(result, 0), mode);
5789     // Reload only the string if it does not match
5790     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
5791 
5792     Label CONT_SCAN_SUBSTR;
5793     // Compare the rest of substring (> 8 chars).
5794     bind(FOUND_SUBSTR);
5795     // First 8 chars are already matched.
5796     negptr(cnt2);
5797     addptr(cnt2, stride);
5798 
5799     bind(SCAN_SUBSTR);
5800     subl(cnt1, stride);
5801     cmpl(cnt2, -stride); // Do not read beyond substring
5802     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
5803     // Back-up strings to avoid reading beyond substring:
5804     // cnt1 = cnt1 - cnt2 + 8
5805     addl(cnt1, cnt2); // cnt2 is negative
5806     addl(cnt1, stride);
5807     movl(cnt2, stride); negptr(cnt2);
5808     bind(CONT_SCAN_SUBSTR);
5809     if (int_cnt2 < (int)G) {
5810       int tail_off1 = int_cnt2<<scale1;
5811       int tail_off2 = int_cnt2<<scale2;
5812       if (ae == StrIntrinsicNode::UL) {
5813         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
5814       } else {
5815         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
5816       }
5817       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
5818     } else {
5819       // calculate index in register to avoid integer overflow (int_cnt2*2)
5820       movl(tmp, int_cnt2);
5821       addptr(tmp, cnt2);
5822       if (ae == StrIntrinsicNode::UL) {
5823         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
5824       } else {
5825         movdqu(vec, Address(str2, tmp, scale2, 0));
5826       }
5827       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
5828     }
5829     // Need to reload string pointers if we did not match the whole vector
5830     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
5831     addptr(cnt2, stride);
5832     jcc(Assembler::negative, SCAN_SUBSTR);
5833     // Fall through if found full substring
5834 
5835   } // (int_cnt2 > 8)
5836 
5837   bind(RET_FOUND);
5838   // Found result if we matched full small substring.
5839   // Compute substr offset
5840   subptr(result, str1);
5841   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
5842     shrl(result, 1); // index
5843   }
5844   bind(EXIT);
5845 
5846 } // string_indexofC8
5847 
5848 // Small strings are loaded through the stack if they cross a page boundary.
5849 void MacroAssembler::string_indexof(Register str1, Register str2,
5850                                     Register cnt1, Register cnt2,
5851                                     int int_cnt2,  Register result,
5852                                     XMMRegister vec, Register tmp,
5853                                     int ae) {
5854   ShortBranchVerifier sbv(this);
5855   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
5856   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
5857 
5858   //
5859   // int_cnt2 is length of small (< 8 chars) constant substring
5860   // or (-1) for a non-constant substring, in which case its length
5861   // is in cnt2 register.
5862   //
5863   // Note, inline_string_indexOf() generates checks:
5864   // if (substr.count > string.count) return -1;
5865   // if (substr.count == 0) return 0;
5866   //
5867   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
5868   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
5869   // This method uses the pcmpestri instruction with bound registers
5870   //   inputs:
5871   //     xmm - substring
5872   //     rax - substring length (elements count)
5873   //     mem - scanned string
5874   //     rdx - string length (elements count)
5875   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
5876   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
5877   //   outputs:
5878   //     rcx - matched index in string
5879   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
5880   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
5881   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
5882   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
5883 
5884   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
5885         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
5886         FOUND_CANDIDATE;
5887 
5888   { //========================================================
5889     // We don't know where these strings are located
5890     // and we can't read beyond them. Load them through the stack.
5891     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
5892 
5893     movptr(tmp, rsp); // save old SP
5894 
5895     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
5896       if (int_cnt2 == (1>>scale2)) { // One byte
5897         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
5898         load_unsigned_byte(result, Address(str2, 0));
5899         movdl(vec, result); // move 32 bits
5900       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
5901         // Not enough header space in 32-bit VM: 12+3 = 15.
5902         movl(result, Address(str2, -1));
5903         shrl(result, 8);
5904         movdl(vec, result); // move 32 bits
5905       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
5906         load_unsigned_short(result, Address(str2, 0));
5907         movdl(vec, result); // move 32 bits
5908       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
5909         movdl(vec, Address(str2, 0)); // move 32 bits
5910       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
5911         movq(vec, Address(str2, 0));  // move 64 bits
5912       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
5913         // Array header size is 12 bytes in 32-bit VM
5914         // + 6 bytes for 3 chars == 18 bytes,
5915         // enough space to load vec and shift.
5916         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
5917         if (ae == StrIntrinsicNode::UL) {
5918           int tail_off = int_cnt2-8;
5919           pmovzxbw(vec, Address(str2, tail_off));
5920           psrldq(vec, -2*tail_off);
5921         }
5922         else {
5923           int tail_off = int_cnt2*(1<<scale2);
5924           movdqu(vec, Address(str2, tail_off-16));
5925           psrldq(vec, 16-tail_off);
5926         }
5927       }
5928     } else { // not constant substring
5929       cmpl(cnt2, stride);
5930       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
5931 
5932       // We can read beyond the string if str2+16 does not cross a page boundary
5933       // since heaps are aligned and mapped by pages.
5934       assert(os::vm_page_size() < (int)G, "default page should be small");
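           // Sketch of the check below, assuming a 4K page: if
           // (str2 & 0xfff) <= 0xff0 then str2 + 15 is still on the same
           // page, so a 16-byte load cannot fault; otherwise the substring
           // is copied to the stack first.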
5935       movl(result, str2); // We need only low 32 bits
5936       andl(result, (os::vm_page_size()-1));
5937       cmpl(result, (os::vm_page_size()-16));
5938       jccb(Assembler::belowEqual, CHECK_STR);
5939 
5940       // Move small strings to the stack to allow loading 16 bytes into vec.
5941       subptr(rsp, 16);
5942       int stk_offset = wordSize-(1<<scale2);
5943       push(cnt2);
5944 
5945       bind(COPY_SUBSTR);
5946       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
5947         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
5948         movb(Address(rsp, cnt2, scale2, stk_offset), result);
5949       } else if (ae == StrIntrinsicNode::UU) {
5950         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
5951         movw(Address(rsp, cnt2, scale2, stk_offset), result);
5952       }
5953       decrement(cnt2);
5954       jccb(Assembler::notZero, COPY_SUBSTR);
5955 
5956       pop(cnt2);
5957       movptr(str2, rsp);  // New substring address
5958     } // non constant
5959 
5960     bind(CHECK_STR);
5961     cmpl(cnt1, stride);
5962     jccb(Assembler::aboveEqual, BIG_STRINGS);
5963 
5964     // Check cross page boundary.
5965     movl(result, str1); // We need only low 32 bits
5966     andl(result, (os::vm_page_size()-1));
5967     cmpl(result, (os::vm_page_size()-16));
5968     jccb(Assembler::belowEqual, BIG_STRINGS);
5969 
5970     subptr(rsp, 16);
5971     int stk_offset = -(1<<scale1);
5972     if (int_cnt2 < 0) { // not constant
5973       push(cnt2);
5974       stk_offset += wordSize;
5975     }
5976     movl(cnt2, cnt1);
5977 
5978     bind(COPY_STR);
5979     if (ae == StrIntrinsicNode::LL) {
5980       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
5981       movb(Address(rsp, cnt2, scale1, stk_offset), result);
5982     } else {
5983       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
5984       movw(Address(rsp, cnt2, scale1, stk_offset), result);
5985     }
5986     decrement(cnt2);
5987     jccb(Assembler::notZero, COPY_STR);
5988 
5989     if (int_cnt2 < 0) { // not constant
5990       pop(cnt2);
5991     }
5992     movptr(str1, rsp);  // New string address
5993 
5994     bind(BIG_STRINGS);
5995     // Load substring.
5996     if (int_cnt2 < 0) { // -1
5997       if (ae == StrIntrinsicNode::UL) {
5998         pmovzxbw(vec, Address(str2, 0));
5999       } else {
6000         movdqu(vec, Address(str2, 0));
6001       }
6002       push(cnt2);       // substr count
6003       push(str2);       // substr addr
6004       push(str1);       // string addr
6005     } else {
6006       // Small (< 8 chars) constant substrings are loaded already.
6007       movl(cnt2, int_cnt2);
6008     }
6009     push(tmp);  // original SP
6010 
6011   } // Finished loading
6012 
6013   //========================================================
6014   // Start search
6015   //
6016 
6017   movptr(result, str1); // string addr
6018 
6019   if (int_cnt2 < 0) {  // Only for a non-constant substring
6020     jmpb(SCAN_TO_SUBSTR);
6021 
6022     // SP saved at sp+0
6023     // String saved at sp+1*wordSize
6024     // Substr saved at sp+2*wordSize
6025     // Substr count saved at sp+3*wordSize
6026 
6027     // Reload substr for rescan; this code
6028     // is executed only for large substrings (> 8 chars)
6029     bind(RELOAD_SUBSTR);
6030     movptr(str2, Address(rsp, 2*wordSize));
6031     movl(cnt2, Address(rsp, 3*wordSize));
6032     if (ae == StrIntrinsicNode::UL) {
6033       pmovzxbw(vec, Address(str2, 0));
6034     } else {
6035       movdqu(vec, Address(str2, 0));
6036     }
6037     // We came here after the beginning of the substring was
6038     // matched but the rest of it was not, so we need to search
6039     // again. Start from the next element after the previous match.
6040     subptr(str1, result); // Restore counter
6041     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6042       shrl(str1, 1);
6043     }
6044     addl(cnt1, str1);
6045     decrementl(cnt1);   // Shift to next element
6046     cmpl(cnt1, cnt2);
6047     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6048 
6049     addptr(result, (1<<scale1));
6050   } // non constant
6051 
6052   // Scan string for start of substr in 16-byte vectors
6053   bind(SCAN_TO_SUBSTR);
6054   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6055   pcmpestri(vec, Address(result, 0), mode);
6056   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6057   subl(cnt1, stride);
6058   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6059   cmpl(cnt1, cnt2);
6060   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6061   addptr(result, 16);
6062 
6063   bind(ADJUST_STR);
6064   cmpl(cnt1, stride); // Do not read beyond string
6065   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6066   // Back-up string to avoid reading beyond string.
6067   lea(result, Address(result, cnt1, scale1, -16));
6068   movl(cnt1, stride);
6069   jmpb(SCAN_TO_SUBSTR);
6070 
6071   // Found a potential substr
6072   bind(FOUND_CANDIDATE);
6073   // After pcmpestri tmp(rcx) contains matched element index
6074 
6075   // Make sure string is still long enough
6076   subl(cnt1, tmp);
6077   cmpl(cnt1, cnt2);
6078   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6079   // Left less than substring.
6080 
6081   bind(RET_NOT_FOUND);
6082   movl(result, -1);
6083   jmp(CLEANUP);
6084 
6085   bind(FOUND_SUBSTR);
6086   // Compute start addr of substr
6087   lea(result, Address(result, tmp, scale1));
6088   if (int_cnt2 > 0) { // Constant substring
6089     // Repeat search for small substring (< 8 chars)
6090     // from new point without reloading substring.
6091     // Have to check that we don't read beyond string.
6092     cmpl(tmp, stride-int_cnt2);
6093     jccb(Assembler::greater, ADJUST_STR);
6094     // Fall through if matched whole substring.
6095   } else { // non constant
6096     assert(int_cnt2 == -1, "should be != 0");
6097 
6098     addl(tmp, cnt2);
6099     // Found result if we matched whole substring.
6100     cmpl(tmp, stride);
6101     jcc(Assembler::lessEqual, RET_FOUND);
6102 
6103     // Repeat search for small substring (<= 8 chars)
6104     // from new point 'str1' without reloading substring.
6105     cmpl(cnt2, stride);
6106     // Have to check that we don't read beyond string.
6107     jccb(Assembler::lessEqual, ADJUST_STR);
6108 
6109     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6110     // Compare the rest of substring (> 8 chars).
6111     movptr(str1, result);
6112 
6113     cmpl(tmp, cnt2);
6114     // First 8 chars are already matched.
6115     jccb(Assembler::equal, CHECK_NEXT);
6116 
6117     bind(SCAN_SUBSTR);
6118     pcmpestri(vec, Address(str1, 0), mode);
6119     // Need to reload string pointers if we did not match the whole vector
6120     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6121 
6122     bind(CHECK_NEXT);
6123     subl(cnt2, stride);
6124     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6125     addptr(str1, 16);
6126     if (ae == StrIntrinsicNode::UL) {
6127       addptr(str2, 8);
6128     } else {
6129       addptr(str2, 16);
6130     }
6131     subl(cnt1, stride);
6132     cmpl(cnt2, stride); // Do not read beyond substring
6133     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6134     // Back-up strings to avoid reading beyond substring.
6135 
6136     if (ae == StrIntrinsicNode::UL) {
6137       lea(str2, Address(str2, cnt2, scale2, -8));
6138       lea(str1, Address(str1, cnt2, scale1, -16));
6139     } else {
6140       lea(str2, Address(str2, cnt2, scale2, -16));
6141       lea(str1, Address(str1, cnt2, scale1, -16));
6142     }
6143     subl(cnt1, cnt2);
6144     movl(cnt2, stride);
6145     addl(cnt1, stride);
6146     bind(CONT_SCAN_SUBSTR);
6147     if (ae == StrIntrinsicNode::UL) {
6148       pmovzxbw(vec, Address(str2, 0));
6149     } else {
6150       movdqu(vec, Address(str2, 0));
6151     }
6152     jmp(SCAN_SUBSTR);
6153 
6154     bind(RET_FOUND_LONG);
6155     movptr(str1, Address(rsp, wordSize));
6156   } // non constant
6157 
6158   bind(RET_FOUND);
6159   // Compute substr offset
6160   subptr(result, str1);
6161   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6162     shrl(result, 1); // index
6163   }
6164   bind(CLEANUP);
6165   pop(rsp); // restore SP
6166 
6167 } // string_indexof
6168 
6169 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
6170                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
6171   ShortBranchVerifier sbv(this);
6172   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6173 
6174   int stride = 8;
6175 
6176   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
6177         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
6178         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
6179         FOUND_SEQ_CHAR, DONE_LABEL;
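       // Scan strategy, as generated below: with AVX2, compare 16 chars per
       // iteration against a broadcast of 'ch'; otherwise (or for the
       // remainder) use an 8-char SSE loop, and finally a scalar loop for
       // the tail.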
6180 
6181   movptr(result, str1);
6182   if (UseAVX >= 2) {
6183     cmpl(cnt1, stride);
6184     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6185     cmpl(cnt1, 2*stride);
6186     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
6187     movdl(vec1, ch);
6188     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
6189     vpxor(vec2, vec2);
6190     movl(tmp, cnt1);
6191     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
6192     andl(cnt1,0x0000000F);  //tail count (in chars)
6193 
6194     bind(SCAN_TO_16_CHAR_LOOP);
6195     vmovdqu(vec3, Address(result, 0));
6196     vpcmpeqw(vec3, vec3, vec1, 1);
6197     vptest(vec2, vec3);
6198     jcc(Assembler::carryClear, FOUND_CHAR);
6199     addptr(result, 32);
6200     subl(tmp, 2*stride);
6201     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
6202     jmp(SCAN_TO_8_CHAR);
6203     bind(SCAN_TO_8_CHAR_INIT);
6204     movdl(vec1, ch);
6205     pshuflw(vec1, vec1, 0x00);
6206     pshufd(vec1, vec1, 0);
6207     pxor(vec2, vec2);
6208   }
6209   bind(SCAN_TO_8_CHAR);
6210   cmpl(cnt1, stride);
6211   if (UseAVX >= 2) {
6212     jcc(Assembler::less, SCAN_TO_CHAR);
6213   } else {
6214     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
6215     movdl(vec1, ch);
6216     pshuflw(vec1, vec1, 0x00);
6217     pshufd(vec1, vec1, 0);
6218     pxor(vec2, vec2);
6219   }
6220   movl(tmp, cnt1);
6221   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
6222   andl(cnt1,0x00000007);  //tail count (in chars)
6223 
6224   bind(SCAN_TO_8_CHAR_LOOP);
6225   movdqu(vec3, Address(result, 0));
6226   pcmpeqw(vec3, vec1);
6227   ptest(vec2, vec3);
6228   jcc(Assembler::carryClear, FOUND_CHAR);
6229   addptr(result, 16);
6230   subl(tmp, stride);
6231   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
6232   bind(SCAN_TO_CHAR);
6233   testl(cnt1, cnt1);
6234   jcc(Assembler::zero, RET_NOT_FOUND);
6235   bind(SCAN_TO_CHAR_LOOP);
6236   load_unsigned_short(tmp, Address(result, 0));
6237   cmpl(ch, tmp);
6238   jccb(Assembler::equal, FOUND_SEQ_CHAR);
6239   addptr(result, 2);
6240   subl(cnt1, 1);
6241   jccb(Assembler::zero, RET_NOT_FOUND);
6242   jmp(SCAN_TO_CHAR_LOOP);
6243 
6244   bind(RET_NOT_FOUND);
6245   movl(result, -1);
6246   jmpb(DONE_LABEL);
6247 
6248   bind(FOUND_CHAR);
6249   if (UseAVX >= 2) {
6250     vpmovmskb(tmp, vec3);
6251   } else {
6252     pmovmskb(tmp, vec3);
6253   }
6254   bsfl(ch, tmp);
6255   addl(result, ch);
6256 
6257   bind(FOUND_SEQ_CHAR);
6258   subptr(result, str1);
6259   shrl(result, 1);
6260 
6261   bind(DONE_LABEL);
6262 } // string_indexof_char
6263 
6264 // helper function for string_compare
6265 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
6266                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
6267                                         Address::ScaleFactor scale2, Register index, int ae) {
6268   if (ae == StrIntrinsicNode::LL) {
6269     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
6270     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
6271   } else if (ae == StrIntrinsicNode::UU) {
6272     load_unsigned_short(elem1, Address(str1, index, scale, 0));
6273     load_unsigned_short(elem2, Address(str2, index, scale, 0));
6274   } else {
6275     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
6276     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
6277   }
6278 }
6279 
6280 // Compare strings, used for char[] and byte[].
6281 void MacroAssembler::string_compare(Register str1, Register str2,
6282                                     Register cnt1, Register cnt2, Register result,
6283                                     XMMRegister vec1, int ae) {
6284   ShortBranchVerifier sbv(this);
6285   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6286   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
6287   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
6288   int stride2x2 = 0x40;
6289   Address::ScaleFactor scale = Address::no_scale;
6290   Address::ScaleFactor scale1 = Address::no_scale;
6291   Address::ScaleFactor scale2 = Address::no_scale;
6292 
6293   if (ae != StrIntrinsicNode::LL) {
6294     stride2x2 = 0x20;
6295   }
6296 
6297   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
6298     shrl(cnt2, 1);
6299   }
6300   // Compute the minimum of the string lengths and push the
6301   // difference of the string lengths onto the stack,
6302   // using a conditional move for the minimum.
6303   movl(result, cnt1);
6304   subl(cnt1, cnt2);
6305   push(cnt1);
6306   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
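       // The pushed difference (cnt1 - cnt2) is popped into result at
       // LENGTH_DIFF_LABEL, so when the strings are equal over the common
       // prefix the length difference becomes the return value.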
6307 
6308   // Is the minimum length zero?
6309   testl(cnt2, cnt2);
6310   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6311   if (ae == StrIntrinsicNode::LL) {
6312     // Load first bytes
6313     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
6314     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
6315   } else if (ae == StrIntrinsicNode::UU) {
6316     // Load first characters
6317     load_unsigned_short(result, Address(str1, 0));
6318     load_unsigned_short(cnt1, Address(str2, 0));
6319   } else {
6320     load_unsigned_byte(result, Address(str1, 0));
6321     load_unsigned_short(cnt1, Address(str2, 0));
6322   }
6323   subl(result, cnt1);
6324   jcc(Assembler::notZero,  POP_LABEL);
6325 
6326   if (ae == StrIntrinsicNode::UU) {
6327     // Divide length by 2 to get number of chars
6328     shrl(cnt2, 1);
6329   }
6330   cmpl(cnt2, 1);
6331   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6332 
6333   // Check if the strings start at the same location and set up scale and stride
6334   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6335     cmpptr(str1, str2);
6336     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6337     if (ae == StrIntrinsicNode::LL) {
6338       scale = Address::times_1;
6339       stride = 16;
6340     } else {
6341       scale = Address::times_2;
6342       stride = 8;
6343     }
6344   } else {
6345     scale1 = Address::times_1;
6346     scale2 = Address::times_2;
6347     // scale not used
6348     stride = 8;
6349   }
6350 
6351   if (UseAVX >= 2 && UseSSE42Intrinsics) {
6352     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6353     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6354     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
6355     Label COMPARE_TAIL_LONG;
6356     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
6357 
6358     int pcmpmask = 0x19;
6359     if (ae == StrIntrinsicNode::LL) {
6360       pcmpmask &= ~0x01;
6361     }
6362 
6363     // Set up to compare 16-char (32-byte) vectors,
6364     // starting from the first character again because it has an aligned address.
6365     if (ae == StrIntrinsicNode::LL) {
6366       stride2 = 32;
6367     } else {
6368       stride2 = 16;
6369     }
6370     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6371       adr_stride = stride << scale;
6372     } else {
6373       adr_stride1 = 8;  //stride << scale1;
6374       adr_stride2 = 16; //stride << scale2;
6375     }
6376 
6377     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6378     // rax and rdx are used by pcmpestri as element counters
6379     movl(result, cnt2);
6380     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
6381     jcc(Assembler::zero, COMPARE_TAIL_LONG);
6382 
6383     // fast path : compare first 2 8-char vectors.
6384     bind(COMPARE_16_CHARS);
6385     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6386       movdqu(vec1, Address(str1, 0));
6387     } else {
6388       pmovzxbw(vec1, Address(str1, 0));
6389     }
6390     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6391     jccb(Assembler::below, COMPARE_INDEX_CHAR);
6392 
6393     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6394       movdqu(vec1, Address(str1, adr_stride));
6395       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6396     } else {
6397       pmovzxbw(vec1, Address(str1, adr_stride1));
6398       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
6399     }
6400     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6401     addl(cnt1, stride);
6402 
6403     // Compare the characters at index in cnt1
6404     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
6405     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6406     subl(result, cnt2);
6407     jmp(POP_LABEL);
6408 
6409     // Set up the registers to start the vector comparison loop
6410     bind(COMPARE_WIDE_VECTORS);
6411     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6412       lea(str1, Address(str1, result, scale));
6413       lea(str2, Address(str2, result, scale));
6414     } else {
6415       lea(str1, Address(str1, result, scale1));
6416       lea(str2, Address(str2, result, scale2));
6417     }
6418     subl(result, stride2);
6419     subl(cnt2, stride2);
6420     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
6421     negptr(result);
6422 
6423     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6424     bind(COMPARE_WIDE_VECTORS_LOOP);
6425 
6426 #ifdef _LP64
6427     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
6428       cmpl(cnt2, stride2x2);
6429       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
6430       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
6431       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
6432 
6433       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
6434       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6435         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
6436         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6437       } else {
6438         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
6439         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
6440       }
6441       kortestql(k7, k7);
6442       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
6443       addptr(result, stride2x2);  // update since we already compared at this addr
6444       subl(cnt2, stride2x2);      // and sub the size too
6445       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
6446 
6447       vpxor(vec1, vec1);
6448       jmpb(COMPARE_WIDE_TAIL);
6449     }//if (VM_Version::supports_avx512vlbw())
6450 #endif // _LP64
6451 
6452 
6453     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6454     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6455       vmovdqu(vec1, Address(str1, result, scale));
6456       vpxor(vec1, Address(str2, result, scale));
6457     } else {
6458       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
6459       vpxor(vec1, Address(str2, result, scale2));
6460     }
6461     vptest(vec1, vec1);
6462     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
6463     addptr(result, stride2);
6464     subl(cnt2, stride2);
6465     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6466     // clean upper bits of YMM registers
6467     vpxor(vec1, vec1);
6468 
6469     // compare wide vectors tail
6470     bind(COMPARE_WIDE_TAIL);
6471     testptr(result, result);
6472     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6473 
6474     movl(result, stride2);
6475     movl(cnt2, result);
6476     negptr(result);
6477     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6478 
6479     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
6480     bind(VECTOR_NOT_EQUAL);
6481     // clean upper bits of YMM registers
6482     vpxor(vec1, vec1);
6483     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6484       lea(str1, Address(str1, result, scale));
6485       lea(str2, Address(str2, result, scale));
6486     } else {
6487       lea(str1, Address(str1, result, scale1));
6488       lea(str2, Address(str2, result, scale2));
6489     }
6490     jmp(COMPARE_16_CHARS);
6491 
6492     // Compare tail chars, length between 1 and 15 chars
6493     bind(COMPARE_TAIL_LONG);
6494     movl(cnt2, result);
6495     cmpl(cnt2, stride);
6496     jcc(Assembler::less, COMPARE_SMALL_STR);
6497 
6498     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6499       movdqu(vec1, Address(str1, 0));
6500     } else {
6501       pmovzxbw(vec1, Address(str1, 0));
6502     }
6503     pcmpestri(vec1, Address(str2, 0), pcmpmask);
6504     jcc(Assembler::below, COMPARE_INDEX_CHAR);
6505     subptr(cnt2, stride);
6506     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6507     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6508       lea(str1, Address(str1, result, scale));
6509       lea(str2, Address(str2, result, scale));
6510     } else {
6511       lea(str1, Address(str1, result, scale1));
6512       lea(str2, Address(str2, result, scale2));
6513     }
6514     negptr(cnt2);
6515     jmpb(WHILE_HEAD_LABEL);
6516 
6517     bind(COMPARE_SMALL_STR);
6518   } else if (UseSSE42Intrinsics) {
6519     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6520     int pcmpmask = 0x19;
6521     // Set up to compare 8-char (16-byte) vectors,
6522     // starting from the first character again because it has an aligned address.
6523     movl(result, cnt2);
6524     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
6525     if (ae == StrIntrinsicNode::LL) {
6526       pcmpmask &= ~0x01;
6527     }
6528     jcc(Assembler::zero, COMPARE_TAIL);
6529     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6530       lea(str1, Address(str1, result, scale));
6531       lea(str2, Address(str2, result, scale));
6532     } else {
6533       lea(str1, Address(str1, result, scale1));
6534       lea(str2, Address(str2, result, scale2));
6535     }
6536     negptr(result);
6537 
6538     // pcmpestri
6539     //   inputs:
6540     //     vec1- substring
6541     //     rax - negative string length (elements count)
6542     //     mem - scanned string
6543     //     rdx - string length (elements count)
6544     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
6545     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
6546     //   outputs:
6547     //     rcx - first mismatched element index
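         // A rough decomposition of pcmpmask (0x19 == 0b011001): bits [1:0]
         // select unsigned words (cleared to unsigned bytes for LL above),
         // bits [3:2] == 10 select the element-wise "equal each" compare and
         // bits [5:4] == 01 negate the result, so rcx receives the index of
         // the first mismatch and CF is set when any mismatch exists.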
6548     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6549 
6550     bind(COMPARE_WIDE_VECTORS);
6551     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6552       movdqu(vec1, Address(str1, result, scale));
6553       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6554     } else {
6555       pmovzxbw(vec1, Address(str1, result, scale1));
6556       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6557     }
6558     // After pcmpestri cnt1(rcx) contains mismatched element index
6559 
6560     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
6561     addptr(result, stride);
6562     subptr(cnt2, stride);
6563     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6564 
6565     // compare wide vectors tail
6566     testptr(result, result);
6567     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6568 
6569     movl(cnt2, stride);
6570     movl(result, stride);
6571     negptr(result);
6572     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6573       movdqu(vec1, Address(str1, result, scale));
6574       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6575     } else {
6576       pmovzxbw(vec1, Address(str1, result, scale1));
6577       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
6578     }
6579     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6580 
6581     // Mismatched characters in the vectors
6582     bind(VECTOR_NOT_EQUAL);
6583     addptr(cnt1, result);
6584     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
6585     subl(result, cnt2);
6586     jmpb(POP_LABEL);
6587 
6588     bind(COMPARE_TAIL); // limit is zero
6589     movl(cnt2, result);
6590     // Fallthru to tail compare
6591   }
6592   // Shift str2 and str1 to the end of the arrays, negate min
6593   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
6594     lea(str1, Address(str1, cnt2, scale));
6595     lea(str2, Address(str2, cnt2, scale));
6596   } else {
6597     lea(str1, Address(str1, cnt2, scale1));
6598     lea(str2, Address(str2, cnt2, scale2));
6599   }
6600   decrementl(cnt2);  // first character was compared already
6601   negptr(cnt2);
6602 
6603   // Compare the rest of the elements
6604   bind(WHILE_HEAD_LABEL);
6605   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
6606   subl(result, cnt1);
6607   jccb(Assembler::notZero, POP_LABEL);
6608   increment(cnt2);
6609   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6610 
6611   // Strings are equal up to min length.  Return the length difference.
6612   bind(LENGTH_DIFF_LABEL);
6613   pop(result);
6614   if (ae == StrIntrinsicNode::UU) {
6615     // Divide diff by 2 to get number of chars
6616     sarl(result, 1);
6617   }
6618   jmpb(DONE_LABEL);
6619 
6620 #ifdef _LP64
6621   if (VM_Version::supports_avx512vlbw()) {
6622 
6623     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
6624 
6625     kmovql(cnt1, k7);
6626     notq(cnt1);
6627     bsfq(cnt2, cnt1);
6628     if (ae != StrIntrinsicNode::LL) {
6629       // Divide diff by 2 to get number of chars
6630       sarl(cnt2, 1);
6631     }
6632     addq(result, cnt2);
6633     if (ae == StrIntrinsicNode::LL) {
6634       load_unsigned_byte(cnt1, Address(str2, result));
6635       load_unsigned_byte(result, Address(str1, result));
6636     } else if (ae == StrIntrinsicNode::UU) {
6637       load_unsigned_short(cnt1, Address(str2, result, scale));
6638       load_unsigned_short(result, Address(str1, result, scale));
6639     } else {
6640       load_unsigned_short(cnt1, Address(str2, result, scale2));
6641       load_unsigned_byte(result, Address(str1, result, scale1));
6642     }
6643     subl(result, cnt1);
6644     jmpb(POP_LABEL);
6645   }//if (VM_Version::supports_avx512vlbw())
6646 #endif // _LP64
6647 
6648   // Discard the stored length difference
6649   bind(POP_LABEL);
6650   pop(cnt1);
6651 
6652   // That's it
6653   bind(DONE_LABEL);
6654   if(ae == StrIntrinsicNode::UL) {
6655     negl(result);
6656   }
6657 
6658 }
6659 
6660 // Search for Non-ASCII character (Negative byte value) in a byte array,
6661 // return true if it has any and false otherwise.
6662 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
6663 //   @HotSpotIntrinsicCandidate
6664 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
6665 //     for (int i = off; i < off + len; i++) {
6666 //       if (ba[i] < 0) {
6667 //         return true;
6668 //       }
6669 //     }
6670 //     return false;
6671 //   }
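     // The generated code relies on a byte being negative exactly when its
     // top bit (0x80) is set: the vector paths below test blocks of bytes
     // against a broadcast 0x80 mask, and the AVX-512 path uses a signed
     // compare against zero.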
6672 void MacroAssembler::has_negatives(Register ary1, Register len,
6673   Register result, Register tmp1,
6674   XMMRegister vec1, XMMRegister vec2) {
6675   // rsi: byte array
6676   // rcx: len
6677   // rax: result
6678   ShortBranchVerifier sbv(this);
6679   assert_different_registers(ary1, len, result, tmp1);
6680   assert_different_registers(vec1, vec2);
6681   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
6682 
6683   // len == 0
6684   testl(len, len);
6685   jcc(Assembler::zero, FALSE_LABEL);
6686 
6687   if ((UseAVX > 2) && // AVX512
6688     VM_Version::supports_avx512vlbw() &&
6689     VM_Version::supports_bmi2()) {
6690 
6691     Label test_64_loop, test_tail;
6692     Register tmp3_aliased = len;
6693 
6694     movl(tmp1, len);
6695     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
6696 
6697     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
6698     andl(len, ~(64 - 1));    // vector count (in chars)
6699     jccb(Assembler::zero, test_tail);
6700 
6701     lea(ary1, Address(ary1, len, Address::times_1));
6702     negptr(len);
6703 
6704     bind(test_64_loop);
6705     // Check whether our 64 elements of size byte contain negatives
6706     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
6707     kortestql(k2, k2);
6708     jcc(Assembler::notZero, TRUE_LABEL);
6709 
6710     addptr(len, 64);
6711     jccb(Assembler::notZero, test_64_loop);
6712 
6713 
6714     bind(test_tail);
6715     // bail out when there is nothing to be done
6716     testl(tmp1, -1);
6717     jcc(Assembler::zero, FALSE_LABEL);
6718 
6719     // Build a mask with the tail-count low bits set: ~(~0 << tmp1) (applied up to two times in a 32-bit scenario)
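         // For example, a hypothetical tail count tmp1 == 5 gives
         // ~(~0 << 5) == 0b11111, i.e. a k-mask selecting the 5 low bytes.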
6720 #ifdef _LP64
6721     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
6722     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
6723     notq(tmp3_aliased);
6724     kmovql(k3, tmp3_aliased);
6725 #else
6726     Label k_init;
6727     jmp(k_init);
6728 
6729     // We cannot read 64 bits from a general purpose register, so we move the
6730     // data required to compose 64 1's into the instruction stream.
6731     // We emit a 64-byte wide series of elements from 0..63 which is later
6732     // used as a compare target against the tail count contained in tmp1.
6733     // The result is a k register with tmp1 consecutive 1's counting from
6734     // the least significant bit.
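         // Sketch of the idea: broadcasting tmp1 (say 5) and comparing it
         // greater-than against the 0..63 byte sequence emitted below sets
         // mask bits exactly at positions 0..4, the same mask ~(~0 << tmp1)
         // that the 64-bit path computes directly.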
6735     address tmp = pc();
6736     emit_int64(0x0706050403020100);
6737     emit_int64(0x0F0E0D0C0B0A0908);
6738     emit_int64(0x1716151413121110);
6739     emit_int64(0x1F1E1D1C1B1A1918);
6740     emit_int64(0x2726252423222120);
6741     emit_int64(0x2F2E2D2C2B2A2928);
6742     emit_int64(0x3736353433323130);
6743     emit_int64(0x3F3E3D3C3B3A3938);
6744 
6745     bind(k_init);
6746     lea(len, InternalAddress(tmp));
6747     // create mask to test for negative byte inside a vector
6748     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
6749     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
6750 
6751 #endif
6752     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
6753     ktestq(k2, k3);
6754     jcc(Assembler::notZero, TRUE_LABEL);
6755 
6756     jmp(FALSE_LABEL);
6757   } else {
6758     movl(result, len); // copy
6759 
6760     if (UseAVX == 2 && UseSSE >= 2) {
6761       // With AVX2, use 32-byte vector compare
6762       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6763 
6764       // Compare 32-byte vectors
6765       andl(result, 0x0000001f);  //   tail count (in bytes)
6766       andl(len, 0xffffffe0);   // vector count (in bytes)
6767       jccb(Assembler::zero, COMPARE_TAIL);
6768 
6769       lea(ary1, Address(ary1, len, Address::times_1));
6770       negptr(len);
6771 
6772       movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
6773       movdl(vec2, tmp1);
6774       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
6775 
6776       bind(COMPARE_WIDE_VECTORS);
6777       vmovdqu(vec1, Address(ary1, len, Address::times_1));
6778       vptest(vec1, vec2);
6779       jccb(Assembler::notZero, TRUE_LABEL);
6780       addptr(len, 32);
6781       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6782 
6783       testl(result, result);
6784       jccb(Assembler::zero, FALSE_LABEL);
6785 
6786       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6787       vptest(vec1, vec2);
6788       jccb(Assembler::notZero, TRUE_LABEL);
6789       jmpb(FALSE_LABEL);
6790 
6791       bind(COMPARE_TAIL); // len is zero
6792       movl(len, result);
6793       // Fallthru to tail compare
6794     } else if (UseSSE42Intrinsics) {
6795       // With SSE4.2, use double quad vector compare
6796       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6797 
6798       // Compare 16-byte vectors
6799       andl(result, 0x0000000f);  //   tail count (in bytes)
6800       andl(len, 0xfffffff0);   // vector count (in bytes)
6801       jcc(Assembler::zero, COMPARE_TAIL);
6802 
6803       lea(ary1, Address(ary1, len, Address::times_1));
6804       negptr(len);
6805 
6806       movl(tmp1, 0x80808080);
6807       movdl(vec2, tmp1);
6808       pshufd(vec2, vec2, 0);
6809 
6810       bind(COMPARE_WIDE_VECTORS);
6811       movdqu(vec1, Address(ary1, len, Address::times_1));
6812       ptest(vec1, vec2);
6813       jcc(Assembler::notZero, TRUE_LABEL);
6814       addptr(len, 16);
6815       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6816 
6817       testl(result, result);
6818       jcc(Assembler::zero, FALSE_LABEL);
6819 
6820       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
6821       ptest(vec1, vec2);
6822       jccb(Assembler::notZero, TRUE_LABEL);
6823       jmpb(FALSE_LABEL);
6824 
6825       bind(COMPARE_TAIL); // len is zero
6826       movl(len, result);
6827       // Fallthru to tail compare
6828     }
6829   }
6830   // Compare 4-byte vectors
6831   andl(len, 0xfffffffc); // vector count (in bytes)
6832   jccb(Assembler::zero, COMPARE_CHAR);
6833 
6834   lea(ary1, Address(ary1, len, Address::times_1));
6835   negptr(len);
6836 
6837   bind(COMPARE_VECTORS);
6838   movl(tmp1, Address(ary1, len, Address::times_1));
6839   andl(tmp1, 0x80808080);
6840   jccb(Assembler::notZero, TRUE_LABEL);
6841   addptr(len, 4);
6842   jcc(Assembler::notZero, COMPARE_VECTORS);
6843 
6844   // Compare trailing char (final 2 bytes), if any
6845   bind(COMPARE_CHAR);
6846   testl(result, 0x2);   // tail  char
6847   jccb(Assembler::zero, COMPARE_BYTE);
6848   load_unsigned_short(tmp1, Address(ary1, 0));
6849   andl(tmp1, 0x00008080);
6850   jccb(Assembler::notZero, TRUE_LABEL);
6851   subptr(result, 2);
6852   lea(ary1, Address(ary1, 2));
6853 
6854   bind(COMPARE_BYTE);
6855   testl(result, 0x1);   // tail  byte
6856   jccb(Assembler::zero, FALSE_LABEL);
6857   load_unsigned_byte(tmp1, Address(ary1, 0));
6858   andl(tmp1, 0x00000080);
6859   jccb(Assembler::notEqual, TRUE_LABEL);
6860   jmpb(FALSE_LABEL);
6861 
6862   bind(TRUE_LABEL);
6863   movl(result, 1);   // return true
6864   jmpb(DONE);
6865 
6866   bind(FALSE_LABEL);
6867   xorl(result, result); // return false
6868 
6869   // That's it
6870   bind(DONE);
6871   if (UseAVX >= 2 && UseSSE >= 2) {
6872     // clean upper bits of YMM registers
6873     vpxor(vec1, vec1);
6874     vpxor(vec2, vec2);
6875   }
6876 }
6877 // Compare char[] or byte[] arrays or substrings aligned to 4 bytes.
6878 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
6879                                    Register limit, Register result, Register chr,
6880                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
6881   ShortBranchVerifier sbv(this);
6882   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
6883 
6884   int length_offset  = arrayOopDesc::length_offset_in_bytes();
6885   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
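       // Comparison strategy, as generated below: 64-byte AVX-512 blocks
       // when available, then 32-byte AVX2 or 16-byte SSE4.2 blocks, and
       // finally a 4-byte / 2-byte / 1-byte tail.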
6886 
6887   if (is_array_equ) {
6888     // Check the input args
6889     cmpoop(ary1, ary2);
6890     jcc(Assembler::equal, TRUE_LABEL);
6891 
6892     // Need additional checks for arrays_equals.
6893     testptr(ary1, ary1);
6894     jcc(Assembler::zero, FALSE_LABEL);
6895     testptr(ary2, ary2);
6896     jcc(Assembler::zero, FALSE_LABEL);
6897 
6898     // Check the lengths
6899     movl(limit, Address(ary1, length_offset));
6900     cmpl(limit, Address(ary2, length_offset));
6901     jcc(Assembler::notEqual, FALSE_LABEL);
6902   }
6903 
6904   // count == 0
6905   testl(limit, limit);
6906   jcc(Assembler::zero, TRUE_LABEL);
6907 
6908   if (is_array_equ) {
6909     // Load array address
6910     lea(ary1, Address(ary1, base_offset));
6911     lea(ary2, Address(ary2, base_offset));
6912   }
6913 
6914   if (is_array_equ && is_char) {
6915     // arrays_equals when used for char[].
6916     shll(limit, 1);      // byte count != 0
6917   }
6918   movl(result, limit); // copy
6919 
6920   if (UseAVX >= 2) {
6921     // With AVX2, use 32-byte vector compare
6922     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6923 
6924     // Compare 32-byte vectors
6925     andl(result, 0x0000001f);  //   tail count (in bytes)
6926     andl(limit, 0xffffffe0);   // vector count (in bytes)
6927     jcc(Assembler::zero, COMPARE_TAIL);
6928 
6929     lea(ary1, Address(ary1, limit, Address::times_1));
6930     lea(ary2, Address(ary2, limit, Address::times_1));
6931     negptr(limit);
6932 
6933     bind(COMPARE_WIDE_VECTORS);
6934 
6935 #ifdef _LP64
6936     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
6937       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
6938 
6939       cmpl(limit, -64);
6940       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
6941 
6942       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
6943 
6944       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
6945       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
6946       kortestql(k7, k7);
6947       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
6948       addptr(limit, 64);  // update since we already compared at this addr
6949       cmpl(limit, -64);
6950       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
6951 
6952       // At this point we may still need to compare -limit+result bytes.
6953       // We could execute the next two instructions and just continue via the non-wide path:
6954       //  cmpl(limit, 0);
6955       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
6956       // But since we stopped at the points ary{1,2}+limit which are
6957       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
6958       // (|limit| <= 32 and result < 32),
6959       // we may just compare the last 64 bytes.
6960       //
6961       addptr(result, -64);   // it is safe, because we just came from this area
6962       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
6963       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
6964       kortestql(k7, k7);
6965       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
6966 
6967       jmp(TRUE_LABEL);
6968 
6969       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
6970 
6971     }//if (VM_Version::supports_avx512vlbw())
6972 #endif //_LP64
6973 
6974     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
6975     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
6976     vpxor(vec1, vec2);
6977 
6978     vptest(vec1, vec1);
6979     jcc(Assembler::notZero, FALSE_LABEL);
6980     addptr(limit, 32);
6981     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6982 
6983     testl(result, result);
6984     jcc(Assembler::zero, TRUE_LABEL);
6985 
6986     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6987     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
6988     vpxor(vec1, vec2);
6989 
6990     vptest(vec1, vec1);
6991     jccb(Assembler::notZero, FALSE_LABEL);
6992     jmpb(TRUE_LABEL);
6993 
6994     bind(COMPARE_TAIL); // limit is zero
6995     movl(limit, result);
6996     // Fallthru to tail compare
6997   } else if (UseSSE42Intrinsics) {
6998     // With SSE4.2, use double quad vector compare
6999     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7000 
7001     // Compare 16-byte vectors
7002     andl(result, 0x0000000f);  //   tail count (in bytes)
7003     andl(limit, 0xfffffff0);   // vector count (in bytes)
7004     jcc(Assembler::zero, COMPARE_TAIL);
7005 
7006     lea(ary1, Address(ary1, limit, Address::times_1));
7007     lea(ary2, Address(ary2, limit, Address::times_1));
7008     negptr(limit);
7009 
7010     bind(COMPARE_WIDE_VECTORS);
7011     movdqu(vec1, Address(ary1, limit, Address::times_1));
7012     movdqu(vec2, Address(ary2, limit, Address::times_1));
7013     pxor(vec1, vec2);
7014 
7015     ptest(vec1, vec1);
7016     jcc(Assembler::notZero, FALSE_LABEL);
7017     addptr(limit, 16);
7018     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7019 
7020     testl(result, result);
7021     jcc(Assembler::zero, TRUE_LABEL);
7022 
7023     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7024     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7025     pxor(vec1, vec2);
7026 
7027     ptest(vec1, vec1);
7028     jccb(Assembler::notZero, FALSE_LABEL);
7029     jmpb(TRUE_LABEL);
7030 
7031     bind(COMPARE_TAIL); // limit is zero
7032     movl(limit, result);
7033     // Fallthru to tail compare
7034   }
7035 
7036   // Compare 4-byte vectors
7037   andl(limit, 0xfffffffc); // vector count (in bytes)
7038   jccb(Assembler::zero, COMPARE_CHAR);
7039 
7040   lea(ary1, Address(ary1, limit, Address::times_1));
7041   lea(ary2, Address(ary2, limit, Address::times_1));
7042   negptr(limit);
7043 
7044   bind(COMPARE_VECTORS);
7045   movl(chr, Address(ary1, limit, Address::times_1));
7046   cmpl(chr, Address(ary2, limit, Address::times_1));
7047   jccb(Assembler::notEqual, FALSE_LABEL);
7048   addptr(limit, 4);
7049   jcc(Assembler::notZero, COMPARE_VECTORS);
7050 
7051   // Compare trailing char (final 2 bytes), if any
7052   bind(COMPARE_CHAR);
7053   testl(result, 0x2);   // tail  char
7054   jccb(Assembler::zero, COMPARE_BYTE);
7055   load_unsigned_short(chr, Address(ary1, 0));
7056   load_unsigned_short(limit, Address(ary2, 0));
7057   cmpl(chr, limit);
7058   jccb(Assembler::notEqual, FALSE_LABEL);
7059 
7060   if (is_array_equ && is_char) {
7061     bind(COMPARE_BYTE);
7062   } else {
7063     lea(ary1, Address(ary1, 2));
7064     lea(ary2, Address(ary2, 2));
7065 
7066     bind(COMPARE_BYTE);
7067     testl(result, 0x1);   // tail  byte
7068     jccb(Assembler::zero, TRUE_LABEL);
7069     load_unsigned_byte(chr, Address(ary1, 0));
7070     load_unsigned_byte(limit, Address(ary2, 0));
7071     cmpl(chr, limit);
7072     jccb(Assembler::notEqual, FALSE_LABEL);
7073   }
7074   bind(TRUE_LABEL);
7075   movl(result, 1);   // return true
7076   jmpb(DONE);
7077 
7078   bind(FALSE_LABEL);
7079   xorl(result, result); // return false
7080 
7081   // That's it
7082   bind(DONE);
7083   if (UseAVX >= 2) {
7084     // clean upper bits of YMM registers
7085     vpxor(vec1, vec1);
7086     vpxor(vec2, vec2);
7087   }
7088 }
7089 
7090 #endif
7091 
7092 void MacroAssembler::generate_fill(BasicType t, bool aligned,
7093                                    Register to, Register value, Register count,
7094                                    Register rtmp, XMMRegister xtmp) {
7095   ShortBranchVerifier sbv(this);
7096   assert_different_registers(to, value, count, rtmp);
7097   Label L_exit;
7098   Label L_fill_2_bytes, L_fill_4_bytes;
7099 
7100   int shift = -1;
7101   switch (t) {
7102     case T_BYTE:
7103       shift = 2;
7104       break;
7105     case T_SHORT:
7106       shift = 1;
7107       break;
7108     case T_INT:
7109       shift = 0;
7110       break;
7111     default: ShouldNotReachHere();
7112   }
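       // With the shift values above, (1 << shift) elements always correspond
       // to 4 bytes, so the element counts below can be compared against byte
       // thresholds uniformly for T_BYTE, T_SHORT and T_INT.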
7113 
7114   if (t == T_BYTE) {
7115     andl(value, 0xff);
7116     movl(rtmp, value);
7117     shll(rtmp, 8);
7118     orl(value, rtmp);
7119   }
7120   if (t == T_SHORT) {
7121     andl(value, 0xffff);
7122   }
7123   if (t == T_BYTE || t == T_SHORT) {
7124     movl(rtmp, value);
7125     shll(rtmp, 16);
7126     orl(value, rtmp);
7127   }
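       // For example, filling a byte array with value 0xAB replicates it to
       // 0xABAB and then to 0xABABABAB, so each 32-bit store below writes
       // the fill pattern into all four byte lanes.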
7128 
7129   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) are filled by element
7130   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7131   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7132     Label L_skip_align2;
7133     // align the destination address to a 4-byte boundary
7134     if (t == T_BYTE) {
7135       Label L_skip_align1;
7136       // One byte misalignment happens only for byte arrays
7137       testptr(to, 1);
7138       jccb(Assembler::zero, L_skip_align1);
7139       movb(Address(to, 0), value);
7140       increment(to);
7141       decrement(count);
7142       BIND(L_skip_align1);
7143     }
7144     // Two bytes misalignment happens only for byte and short (char) arrays
7145     testptr(to, 2);
7146     jccb(Assembler::zero, L_skip_align2);
7147     movw(Address(to, 0), value);
7148     addptr(to, 2);
7149     subl(count, 1<<(shift-1));
7150     BIND(L_skip_align2);
7151   }
7152   if (UseSSE < 2) {
7153     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7154     // Fill 32-byte chunks
7155     subl(count, 8 << shift);
7156     jcc(Assembler::less, L_check_fill_8_bytes);
7157     align(16);
7158 
7159     BIND(L_fill_32_bytes_loop);
7160 
7161     for (int i = 0; i < 32; i += 4) {
7162       movl(Address(to, i), value);
7163     }
7164 
7165     addptr(to, 32);
7166     subl(count, 8 << shift);
7167     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7168     BIND(L_check_fill_8_bytes);
7169     addl(count, 8 << shift);
7170     jccb(Assembler::zero, L_exit);
7171     jmpb(L_fill_8_bytes);
7172 
7173     //
7174     // length is too short, just fill qwords
7175     //
7176     BIND(L_fill_8_bytes_loop);
7177     movl(Address(to, 0), value);
7178     movl(Address(to, 4), value);
7179     addptr(to, 8);
7180     BIND(L_fill_8_bytes);
7181     subl(count, 1 << (shift + 1));
7182     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7183     // fall through to fill 4 bytes
7184   } else {
7185     Label L_fill_32_bytes;
7186     if (!UseUnalignedLoadStores) {
7187       // align to 8 bytes, we know we are 4 byte aligned to start
7188       testptr(to, 4);
7189       jccb(Assembler::zero, L_fill_32_bytes);
7190       movl(Address(to, 0), value);
7191       addptr(to, 4);
7192       subl(count, 1<<shift);
7193     }
7194     BIND(L_fill_32_bytes);
7195     {
7196       assert( UseSSE >= 2, "supported cpu only" );
7197       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7198       movdl(xtmp, value);
7199       if (UseAVX > 2 && UseUnalignedLoadStores) {
7200         // Fill 64-byte chunks
7201         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7202         vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
7203 
7204         subl(count, 16 << shift);
7205         jcc(Assembler::less, L_check_fill_32_bytes);
7206         align(16);
7207 
7208         BIND(L_fill_64_bytes_loop);
7209         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
7210         addptr(to, 64);
7211         subl(count, 16 << shift);
7212         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7213 
7214         BIND(L_check_fill_32_bytes);
7215         addl(count, 8 << shift);
7216         jccb(Assembler::less, L_check_fill_8_bytes);
7217         vmovdqu(Address(to, 0), xtmp);
7218         addptr(to, 32);
7219         subl(count, 8 << shift);
7220 
7221         BIND(L_check_fill_8_bytes);
7222       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
7223         // Fill 64-byte chunks
7224         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7225         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
7226 
7227         subl(count, 16 << shift);
7228         jcc(Assembler::less, L_check_fill_32_bytes);
7229         align(16);
7230 
7231         BIND(L_fill_64_bytes_loop);
7232         vmovdqu(Address(to, 0), xtmp);
7233         vmovdqu(Address(to, 32), xtmp);
7234         addptr(to, 64);
7235         subl(count, 16 << shift);
7236         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7237 
7238         BIND(L_check_fill_32_bytes);
7239         addl(count, 8 << shift);
7240         jccb(Assembler::less, L_check_fill_8_bytes);
7241         vmovdqu(Address(to, 0), xtmp);
7242         addptr(to, 32);
7243         subl(count, 8 << shift);
7244 
7245         BIND(L_check_fill_8_bytes);
7246         // clean upper bits of YMM registers
7247         movdl(xtmp, value);
7248         pshufd(xtmp, xtmp, 0);
7249       } else {
7250         // Fill 32-byte chunks
7251         pshufd(xtmp, xtmp, 0);
7252 
7253         subl(count, 8 << shift);
7254         jcc(Assembler::less, L_check_fill_8_bytes);
7255         align(16);
7256 
7257         BIND(L_fill_32_bytes_loop);
7258 
7259         if (UseUnalignedLoadStores) {
7260           movdqu(Address(to, 0), xtmp);
7261           movdqu(Address(to, 16), xtmp);
7262         } else {
7263           movq(Address(to, 0), xtmp);
7264           movq(Address(to, 8), xtmp);
7265           movq(Address(to, 16), xtmp);
7266           movq(Address(to, 24), xtmp);
7267         }
7268 
7269         addptr(to, 32);
7270         subl(count, 8 << shift);
7271         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7272 
7273         BIND(L_check_fill_8_bytes);
7274       }
7275       addl(count, 8 << shift);
7276       jccb(Assembler::zero, L_exit);
7277       jmpb(L_fill_8_bytes);
7278 
7279       //
7280       // length is too short, just fill qwords
7281       //
7282       BIND(L_fill_8_bytes_loop);
7283       movq(Address(to, 0), xtmp);
7284       addptr(to, 8);
7285       BIND(L_fill_8_bytes);
7286       subl(count, 1 << (shift + 1));
7287       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7288     }
7289   }
7290   // fill trailing 4 bytes
7291   BIND(L_fill_4_bytes);
7292   testl(count, 1<<shift);
7293   jccb(Assembler::zero, L_fill_2_bytes);
7294   movl(Address(to, 0), value);
7295   if (t == T_BYTE || t == T_SHORT) {
7296     Label L_fill_byte;
7297     addptr(to, 4);
7298     BIND(L_fill_2_bytes);
7299     // fill trailing 2 bytes
7300     testl(count, 1<<(shift-1));
7301     jccb(Assembler::zero, L_fill_byte);
7302     movw(Address(to, 0), value);
7303     if (t == T_BYTE) {
7304       addptr(to, 2);
7305       BIND(L_fill_byte);
7306       // fill trailing byte
7307       testl(count, 1);
7308       jccb(Assembler::zero, L_exit);
7309       movb(Address(to, 0), value);
7310     } else {
7311       BIND(L_fill_byte);
7312     }
7313   } else {
7314     BIND(L_fill_2_bytes);
7315   }
7316   BIND(L_exit);
7317 }
7318 
7319 // encode char[] to byte[] in ISO_8859_1
7320 //   @HotSpotIntrinsicCandidate
7321 //   private static int implEncodeISOArray(byte[] sa, int sp,
7322 //                                         byte[] da, int dp, int len) {
7323 //     int i = 0;
7324 //     for (; i < len; i++) {
7325 //       char c = StringUTF16.getChar(sa, sp++);
7326 //       if (c > '\u00FF')
7327 //         break;
7328 //       da[dp++] = (byte)c;
7329 //     }
7330 //     return i;
7331 //   }
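     //
     // The vectorized paths below test blocks of chars against the mask 0xff00ff00 (the high
     // byte of every char): a non-zero result means some char is above 0xFF, so the code falls
     // back to the scalar copy loop; otherwise packuswb/vpackuswb narrows the chars to bytes.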
7332 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7333   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7334   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7335   Register tmp5, Register result) {
7336 
7337   // rsi: src
7338   // rdi: dst
7339   // rdx: len
7340   // rcx: tmp5
7341   // rax: result
7342   ShortBranchVerifier sbv(this);
7343   assert_different_registers(src, dst, len, tmp5, result);
7344   Label L_done, L_copy_1_char, L_copy_1_char_exit;
7345 
7346   // set result
7347   xorl(result, result);
7348   // check for zero length
7349   testl(len, len);
7350   jcc(Assembler::zero, L_done);
7351 
7352   movl(result, len);
7353 
7354   // Setup pointers
7355   lea(src, Address(src, len, Address::times_2)); // char[]
7356   lea(dst, Address(dst, len, Address::times_1)); // byte[]
7357   negptr(len);
7358 
7359   if (UseSSE42Intrinsics || UseAVX >= 2) {
7360     Label L_copy_8_chars, L_copy_8_chars_exit;
7361     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7362 
7363     if (UseAVX >= 2) {
7364       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7365       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7366       movdl(tmp1Reg, tmp5);
7367       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
7368       jmp(L_chars_32_check);
7369 
7370       bind(L_copy_32_chars);
7371       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7372       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7373       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7374       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7375       jccb(Assembler::notZero, L_copy_32_chars_exit);
7376       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
7377       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
7378       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7379 
7380       bind(L_chars_32_check);
7381       addptr(len, 32);
7382       jcc(Assembler::lessEqual, L_copy_32_chars);
7383 
7384       bind(L_copy_32_chars_exit);
7385       subptr(len, 16);
7386       jccb(Assembler::greater, L_copy_16_chars_exit);
7387 
7388     } else if (UseSSE42Intrinsics) {
7389       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
7390       movdl(tmp1Reg, tmp5);
7391       pshufd(tmp1Reg, tmp1Reg, 0);
7392       jmpb(L_chars_16_check);
7393     }
7394 
7395     bind(L_copy_16_chars);
7396     if (UseAVX >= 2) {
7397       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7398       vptest(tmp2Reg, tmp1Reg);
7399       jcc(Assembler::notZero, L_copy_16_chars_exit);
7400       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
7401       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
7402     } else {
7403       if (UseAVX > 0) {
7404         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7405         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7406         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
7407       } else {
7408         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7409         por(tmp2Reg, tmp3Reg);
7410         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7411         por(tmp2Reg, tmp4Reg);
7412       }
7413       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
7414       jccb(Assembler::notZero, L_copy_16_chars_exit);
7415       packuswb(tmp3Reg, tmp4Reg);
7416     }
7417     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7418 
7419     bind(L_chars_16_check);
7420     addptr(len, 16);
7421     jcc(Assembler::lessEqual, L_copy_16_chars);
7422 
7423     bind(L_copy_16_chars_exit);
7424     if (UseAVX >= 2) {
7425       // clean upper bits of YMM registers
7426       vpxor(tmp2Reg, tmp2Reg);
7427       vpxor(tmp3Reg, tmp3Reg);
7428       vpxor(tmp4Reg, tmp4Reg);
7429       movdl(tmp1Reg, tmp5);
7430       pshufd(tmp1Reg, tmp1Reg, 0);
7431     }
7432     subptr(len, 8);
7433     jccb(Assembler::greater, L_copy_8_chars_exit);
7434 
7435     bind(L_copy_8_chars);
7436     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7437     ptest(tmp3Reg, tmp1Reg);
7438     jccb(Assembler::notZero, L_copy_8_chars_exit);
7439     packuswb(tmp3Reg, tmp1Reg);
7440     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7441     addptr(len, 8);
7442     jccb(Assembler::lessEqual, L_copy_8_chars);
7443 
7444     bind(L_copy_8_chars_exit);
7445     subptr(len, 8);
7446     jccb(Assembler::zero, L_done);
7447   }
7448 
7449   bind(L_copy_1_char);
7450   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7451   testl(tmp5, 0xff00);      // check if Unicode char
7452   jccb(Assembler::notZero, L_copy_1_char_exit);
7453   movb(Address(dst, len, Address::times_1, 0), tmp5);
7454   addptr(len, 1);
7455   jccb(Assembler::less, L_copy_1_char);
7456 
7457   bind(L_copy_1_char_exit);
7458   addptr(result, len); // len is negative count of not processed elements
7459 
7460   bind(L_done);
7461 }
7462 
7463 #ifdef _LP64
7464 /**
7465  * Helper for multiply_to_len().
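      * Effectively computes the 128-bit sum dest_hi:dest_lo += src1 + src2 (src1 and src2 zero-extended).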
7466  */
7467 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7468   addq(dest_lo, src1);
7469   adcq(dest_hi, 0);
7470   addq(dest_lo, src2);
7471   adcq(dest_hi, 0);
7472 }
7473 
7474 /**
7475  * Multiply 64 bit by 64 bit first loop.
7476  */
7477 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7478                                            Register y, Register y_idx, Register z,
7479                                            Register carry, Register product,
7480                                            Register idx, Register kdx) {
7481   //
7482   //  jlong carry, x[], y[], z[];
7483   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7484   //    huge_128 product = y[idx] * x[xstart] + carry;
7485   //    z[kdx] = (jlong)product;
7486   //    carry  = (jlong)(product >>> 64);
7487   //  }
7488   //  z[xstart] = carry;
7489   //
7490 
7491   Label L_first_loop, L_first_loop_exit;
7492   Label L_one_x, L_one_y, L_multiply;
7493 
7494   decrementl(xstart);
7495   jcc(Assembler::negative, L_one_x);
7496 
7497   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7498   rorq(x_xstart, 32); // convert big-endian to little-endian
7499 
7500   bind(L_first_loop);
7501   decrementl(idx);
7502   jcc(Assembler::negative, L_first_loop_exit);
7503   decrementl(idx);
7504   jcc(Assembler::negative, L_one_y);
7505   movq(y_idx, Address(y, idx, Address::times_4,  0));
7506   rorq(y_idx, 32); // convert big-endian to little-endian
7507   bind(L_multiply);
7508   movq(product, x_xstart);
7509   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7510   addq(product, carry);
7511   adcq(rdx, 0);
7512   subl(kdx, 2);
7513   movl(Address(z, kdx, Address::times_4,  4), product);
7514   shrq(product, 32);
7515   movl(Address(z, kdx, Address::times_4,  0), product);
7516   movq(carry, rdx);
7517   jmp(L_first_loop);
7518 
7519   bind(L_one_y);
7520   movl(y_idx, Address(y,  0));
7521   jmp(L_multiply);
7522 
7523   bind(L_one_x);
7524   movl(x_xstart, Address(x,  0));
7525   jmp(L_first_loop);
7526 
7527   bind(L_first_loop_exit);
7528 }
7529 
7530 /**
7531  * Multiply 64 bit by 64 bit and add 128 bit.
7532  */
7533 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7534                                             Register yz_idx, Register idx,
7535                                             Register carry, Register product, int offset) {
7536   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7537   //     z[kdx] = (jlong)product;
7538 
7539   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
7540   rorq(yz_idx, 32); // convert big-endian to little-endian
7541   movq(product, x_xstart);
7542   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
7543   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
7544   rorq(yz_idx, 32); // convert big-endian to little-endian
7545 
7546   add2_with_carry(rdx, product, carry, yz_idx);
7547 
7548   movl(Address(z, idx, Address::times_4,  offset+4), product);
7549   shrq(product, 32);
7550   movl(Address(z, idx, Address::times_4,  offset), product);
7551 
7552 }
7553 
7554 /**
7555  * Multiply 128 bit by 128 bit. Unrolled inner loop.
7556  */
7557 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7558                                              Register yz_idx, Register idx, Register jdx,
7559                                              Register carry, Register product,
7560                                              Register carry2) {
7561   //   jlong carry, x[], y[], z[];
7562   //   int kdx = ystart+1;
7563   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7564   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7565   //     z[kdx+idx+1] = (jlong)product;
7566   //     jlong carry2  = (jlong)(product >>> 64);
7567   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7568   //     z[kdx+idx] = (jlong)product;
7569   //     carry  = (jlong)(product >>> 64);
7570   //   }
7571   //   idx += 2;
7572   //   if (idx > 0) {
7573   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7574   //     z[kdx+idx] = (jlong)product;
7575   //     carry  = (jlong)(product >>> 64);
7576   //   }
7577   //
7578 
7579   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7580 
7581   movl(jdx, idx);
7582   andl(jdx, 0xFFFFFFFC);
7583   shrl(jdx, 2);
7584 
7585   bind(L_third_loop);
7586   subl(jdx, 1);
7587   jcc(Assembler::negative, L_third_loop_exit);
7588   subl(idx, 4);
7589 
7590   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7591   movq(carry2, rdx);
7592 
7593   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7594   movq(carry, rdx);
7595   jmp(L_third_loop);
7596 
7597   bind (L_third_loop_exit);
7598 
7599   andl (idx, 0x3);
7600   jcc(Assembler::zero, L_post_third_loop_done);
7601 
7602   Label L_check_1;
7603   subl(idx, 2);
7604   jcc(Assembler::negative, L_check_1);
7605 
7606   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7607   movq(carry, rdx);
7608 
7609   bind (L_check_1);
7610   addl (idx, 0x2);
7611   andl (idx, 0x1);
7612   subl(idx, 1);
7613   jcc(Assembler::negative, L_post_third_loop_done);
7614 
7615   movl(yz_idx, Address(y, idx, Address::times_4,  0));
7616   movq(product, x_xstart);
7617   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7618   movl(yz_idx, Address(z, idx, Address::times_4,  0));
7619 
7620   add2_with_carry(rdx, product, yz_idx, carry);
7621 
7622   movl(Address(z, idx, Address::times_4,  0), product);
7623   shrq(product, 32);
7624 
7625   shlq(rdx, 32);
7626   orq(product, rdx);
7627   movq(carry, product);
7628 
7629   bind(L_post_third_loop_done);
7630 }
7631 
7632 /**
7633  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7634  *
7635  */
7636 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7637                                                   Register carry, Register carry2,
7638                                                   Register idx, Register jdx,
7639                                                   Register yz_idx1, Register yz_idx2,
7640                                                   Register tmp, Register tmp3, Register tmp4) {
7641   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7642 
7643   //   jlong carry, x[], y[], z[];
7644   //   int kdx = ystart+1;
7645   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7646   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7647   //     jlong carry2  = (jlong)(tmp3 >>> 64);
7648   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
7649   //     carry  = (jlong)(tmp4 >>> 64);
7650   //     z[kdx+idx+1] = (jlong)tmp3;
7651   //     z[kdx+idx] = (jlong)tmp4;
7652   //   }
7653   //   idx += 2;
7654   //   if (idx > 0) {
7655   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7656   //     z[kdx+idx] = (jlong)yz_idx1;
7657   //     carry  = (jlong)(yz_idx1 >>> 64);
7658   //   }
7659   //
7660 
7661   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7662 
7663   movl(jdx, idx);
7664   andl(jdx, 0xFFFFFFFC);
7665   shrl(jdx, 2);
7666 
7667   bind(L_third_loop);
7668   subl(jdx, 1);
7669   jcc(Assembler::negative, L_third_loop_exit);
7670   subl(idx, 4);
7671 
7672   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
7673   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7674   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
7675   rorxq(yz_idx2, yz_idx2, 32);
7676 
7677   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
7678   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
7679 
7680   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
7681   rorxq(yz_idx1, yz_idx1, 32);
7682   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7683   rorxq(yz_idx2, yz_idx2, 32);
7684 
7685   if (VM_Version::supports_adx()) {
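         // ADX: adcxq accumulates through the CF chain and adoxq through the OF chain,
         // so the two additions below proceed on independent carry chains.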
7686     adcxq(tmp3, carry);
7687     adoxq(tmp3, yz_idx1);
7688 
7689     adcxq(tmp4, tmp);
7690     adoxq(tmp4, yz_idx2);
7691 
7692     movl(carry, 0); // does not affect flags
7693     adcxq(carry2, carry);
7694     adoxq(carry2, carry);
7695   } else {
7696     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7697     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7698   }
7699   movq(carry, carry2);
7700 
7701   movl(Address(z, idx, Address::times_4, 12), tmp3);
7702   shrq(tmp3, 32);
7703   movl(Address(z, idx, Address::times_4,  8), tmp3);
7704 
7705   movl(Address(z, idx, Address::times_4,  4), tmp4);
7706   shrq(tmp4, 32);
7707   movl(Address(z, idx, Address::times_4,  0), tmp4);
7708 
7709   jmp(L_third_loop);
7710 
7711   bind (L_third_loop_exit);
7712 
7713   andl (idx, 0x3);
7714   jcc(Assembler::zero, L_post_third_loop_done);
7715 
7716   Label L_check_1;
7717   subl(idx, 2);
7718   jcc(Assembler::negative, L_check_1);
7719 
7720   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
7721   rorxq(yz_idx1, yz_idx1, 32);
7722   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
7723   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
7724   rorxq(yz_idx2, yz_idx2, 32);
7725 
7726   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7727 
7728   movl(Address(z, idx, Address::times_4,  4), tmp3);
7729   shrq(tmp3, 32);
7730   movl(Address(z, idx, Address::times_4,  0), tmp3);
7731   movq(carry, tmp4);
7732 
7733   bind (L_check_1);
7734   addl (idx, 0x2);
7735   andl (idx, 0x1);
7736   subl(idx, 1);
7737   jcc(Assembler::negative, L_post_third_loop_done);
7738   movl(tmp4, Address(y, idx, Address::times_4,  0));
7739   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
7740   movl(tmp4, Address(z, idx, Address::times_4,  0));
7741 
7742   add2_with_carry(carry2, tmp3, tmp4, carry);
7743 
7744   movl(Address(z, idx, Address::times_4,  0), tmp3);
7745   shrq(tmp3, 32);
7746 
7747   shlq(carry2, 32);
7748   orq(tmp3, carry2);
7749   movq(carry, tmp3);
7750 
7751   bind(L_post_third_loop_done);
7752 }
7753 
7754 /**
7755  * Code for BigInteger::multiplyToLen() intrinsic.
7756  *
7757  * rdi: x
7758  * rax: xlen
7759  * rsi: y
7760  * rcx: ylen
7761  * r8:  z
7762  * r11: zlen
7763  * r12: tmp1
7764  * r13: tmp2
7765  * r14: tmp3
7766  * r15: tmp4
7767  * rbx: tmp5
7768  *
7769  */
7770 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7771                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7772   ShortBranchVerifier sbv(this);
7773   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7774 
7775   push(tmp1);
7776   push(tmp2);
7777   push(tmp3);
7778   push(tmp4);
7779   push(tmp5);
7780 
7781   push(xlen);
7782   push(zlen);
7783 
7784   const Register idx = tmp1;
7785   const Register kdx = tmp2;
7786   const Register xstart = tmp3;
7787 
7788   const Register y_idx = tmp4;
7789   const Register carry = tmp5;
7790   const Register product  = xlen;
7791   const Register x_xstart = zlen;  // reuse register
7792 
7793   // First Loop.
7794   //
7795   //  final static long LONG_MASK = 0xffffffffL;
7796   //  int xstart = xlen - 1;
7797   //  int ystart = ylen - 1;
7798   //  long carry = 0;
7799   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7800   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7801   //    z[kdx] = (int)product;
7802   //    carry = product >>> 32;
7803   //  }
7804   //  z[xstart] = (int)carry;
7805   //
7806 
7807   movl(idx, ylen);      // idx = ylen;
7808   movl(kdx, zlen);      // kdx = xlen+ylen;
7809   xorq(carry, carry);   // carry = 0;
7810 
7811   Label L_done;
7812 
7813   movl(xstart, xlen);
7814   decrementl(xstart);
7815   jcc(Assembler::negative, L_done);
7816 
7817   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7818 
7819   Label L_second_loop;
7820   testl(kdx, kdx);
7821   jcc(Assembler::zero, L_second_loop);
7822 
7823   Label L_carry;
7824   subl(kdx, 1);
7825   jcc(Assembler::zero, L_carry);
7826 
7827   movl(Address(z, kdx, Address::times_4,  0), carry);
7828   shrq(carry, 32);
7829   subl(kdx, 1);
7830 
7831   bind(L_carry);
7832   movl(Address(z, kdx, Address::times_4,  0), carry);
7833 
7834   // Second and third (nested) loops.
7835   //
7836   // for (int i = xstart-1; i >= 0; i--) { // Second loop
7837   //   carry = 0;
7838   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7839   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7840   //                    (z[k] & LONG_MASK) + carry;
7841   //     z[k] = (int)product;
7842   //     carry = product >>> 32;
7843   //   }
7844   //   z[i] = (int)carry;
7845   // }
7846   //
7847   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7848 
7849   const Register jdx = tmp1;
7850 
7851   bind(L_second_loop);
7852   xorl(carry, carry);    // carry = 0;
7853   movl(jdx, ylen);       // j = ystart+1
7854 
7855   subl(xstart, 1);       // i = xstart-1;
7856   jcc(Assembler::negative, L_done);
7857 
7858   push (z);
7859 
7860   Label L_last_x;
7861   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7862   subl(xstart, 1);       // i = xstart-1;
7863   jcc(Assembler::negative, L_last_x);
7864 
7865   if (UseBMI2Instructions) {
7866     movq(rdx,  Address(x, xstart, Address::times_4,  0));
7867     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7868   } else {
7869     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
7870     rorq(x_xstart, 32);  // convert big-endian to little-endian
7871   }
7872 
7873   Label L_third_loop_prologue;
7874   bind(L_third_loop_prologue);
7875 
7876   push (x);
7877   push (xstart);
7878   push (ylen);
7879 
7880 
7881   if (UseBMI2Instructions) {
7882     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7883   } else { // !UseBMI2Instructions
7884     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7885   }
7886 
7887   pop(ylen);
7888   pop(xlen);
7889   pop(x);
7890   pop(z);
7891 
7892   movl(tmp3, xlen);
7893   addl(tmp3, 1);
7894   movl(Address(z, tmp3, Address::times_4,  0), carry);
7895   subl(tmp3, 1);
7896   jccb(Assembler::negative, L_done);
7897 
7898   shrq(carry, 32);
7899   movl(Address(z, tmp3, Address::times_4,  0), carry);
7900   jmp(L_second_loop);
7901 
7902   // The following infrequent code is moved outside the loops.
7903   bind(L_last_x);
7904   if (UseBMI2Instructions) {
7905     movl(rdx, Address(x,  0));
7906   } else {
7907     movl(x_xstart, Address(x,  0));
7908   }
7909   jmp(L_third_loop_prologue);
7910 
7911   bind(L_done);
7912 
7913   pop(zlen);
7914   pop(xlen);
7915 
7916   pop(tmp5);
7917   pop(tmp4);
7918   pop(tmp3);
7919   pop(tmp2);
7920   pop(tmp1);
7921 }
7922 
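     // Mismatch search over the two memory regions obja and objb: result is set to the element
     // index of the first difference, or to -1 when the compared regions are found identical.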
7923 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
7924   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
7925   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
7926   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
7927   Label VECTOR8_TAIL, VECTOR4_TAIL;
7928   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
7929   Label SAME_TILL_END, DONE;
7930   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
7931 
7932   // scale is in rcx in both Win64 and Unix
7933   ShortBranchVerifier sbv(this);
7934 
7935   shlq(length);
7936   xorq(result, result);
7937 
7938   if ((UseAVX > 2) &&
7939       VM_Version::supports_avx512vlbw()) {
7940     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
7941 
7942     cmpq(length, 64);
7943     jcc(Assembler::less, VECTOR32_TAIL);
7944     movq(tmp1, length);
7945     andq(tmp1, 0x3F);      // tail count
7946     andq(length, ~(0x3F)); //vector count
7947 
7948     bind(VECTOR64_LOOP);
7949     // AVX512 code to compare 64 byte vectors.
7950     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
7951     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
7952     kortestql(k7, k7);
7953     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
7954     addq(result, 64);
7955     subq(length, 64);
7956     jccb(Assembler::notZero, VECTOR64_LOOP);
7957 
7958     //bind(VECTOR64_TAIL);
7959     testq(tmp1, tmp1);
7960     jcc(Assembler::zero, SAME_TILL_END);
7961 
7962     //bind(VECTOR64_TAIL);
7963     // AVX512 code to compare up to 63 remaining bytes.
7964     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
7965     shlxq(tmp2, tmp2, tmp1);
7966     notq(tmp2);
7967     kmovql(k3, tmp2);
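         // k3 now has its low tmp1 bits set, selecting exactly the tail bytes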
7968 
7969     evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
7970     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
7971 
7972     ktestql(k7, k3);
7973     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
7974 
7975     bind(VECTOR64_NOT_EQUAL);
7976     kmovql(tmp1, k7);
7977     notq(tmp1);
7978     tzcntq(tmp1, tmp1);
7979     addq(result, tmp1);
7980     shrq(result);
7981     jmp(DONE);
7982     bind(VECTOR32_TAIL);
7983   }
7984 
7985   cmpq(length, 8);
7986   jcc(Assembler::equal, VECTOR8_LOOP);
7987   jcc(Assembler::less, VECTOR4_TAIL);
7988 
7989   if (UseAVX >= 2) {
7990     Label VECTOR16_TAIL, VECTOR32_LOOP;
7991 
7992     cmpq(length, 16);
7993     jcc(Assembler::equal, VECTOR16_LOOP);
7994     jcc(Assembler::less, VECTOR8_LOOP);
7995 
7996     cmpq(length, 32);
7997     jccb(Assembler::less, VECTOR16_TAIL);
7998 
7999     subq(length, 32);
8000     bind(VECTOR32_LOOP);
8001     vmovdqu(rymm0, Address(obja, result));
8002     vmovdqu(rymm1, Address(objb, result));
8003     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8004     vptest(rymm2, rymm2);
8005     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8006     addq(result, 32);
8007     subq(length, 32);
8008     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
8009     addq(length, 32);
8010     jcc(Assembler::equal, SAME_TILL_END);
8011     // falling through if less than 32 bytes left; close the branch here.
8012 
8013     bind(VECTOR16_TAIL);
8014     cmpq(length, 16);
8015     jccb(Assembler::less, VECTOR8_TAIL);
8016     bind(VECTOR16_LOOP);
8017     movdqu(rymm0, Address(obja, result));
8018     movdqu(rymm1, Address(objb, result));
8019     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8020     ptest(rymm2, rymm2);
8021     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8022     addq(result, 16);
8023     subq(length, 16);
8024     jcc(Assembler::equal, SAME_TILL_END);
8025     //falling through if less than 16 bytes left
8026   } else {//regular intrinsics
8027 
8028     cmpq(length, 16);
8029     jccb(Assembler::less, VECTOR8_TAIL);
8030 
8031     subq(length, 16);
8032     bind(VECTOR16_LOOP);
8033     movdqu(rymm0, Address(obja, result));
8034     movdqu(rymm1, Address(objb, result));
8035     pxor(rymm0, rymm1);
8036     ptest(rymm0, rymm0);
8037     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8038     addq(result, 16);
8039     subq(length, 16);
8040     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8041     addq(length, 16);
8042     jcc(Assembler::equal, SAME_TILL_END);
8043     //falling through if less than 16 bytes left
8044   }
8045 
8046   bind(VECTOR8_TAIL);
8047   cmpq(length, 8);
8048   jccb(Assembler::less, VECTOR4_TAIL);
8049   bind(VECTOR8_LOOP);
8050   movq(tmp1, Address(obja, result));
8051   movq(tmp2, Address(objb, result));
8052   xorq(tmp1, tmp2);
8053   testq(tmp1, tmp1);
8054   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8055   addq(result, 8);
8056   subq(length, 8);
8057   jcc(Assembler::equal, SAME_TILL_END);
8058   //falling through if less than 8 bytes left
8059 
8060   bind(VECTOR4_TAIL);
8061   cmpq(length, 4);
8062   jccb(Assembler::less, BYTES_TAIL);
8063   bind(VECTOR4_LOOP);
8064   movl(tmp1, Address(obja, result));
8065   xorl(tmp1, Address(objb, result));
8066   testl(tmp1, tmp1);
8067   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8068   addq(result, 4);
8069   subq(length, 4);
8070   jcc(Assembler::equal, SAME_TILL_END);
8071   //falling through if less than 4 bytes left
8072 
8073   bind(BYTES_TAIL);
8074   bind(BYTES_LOOP);
8075   load_unsigned_byte(tmp1, Address(obja, result));
8076   load_unsigned_byte(tmp2, Address(objb, result));
8077   xorl(tmp1, tmp2);
8078   testl(tmp1, tmp1);
8079   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8080   decq(length);
8081   jcc(Assembler::zero, SAME_TILL_END);
8082   incq(result);
8083   load_unsigned_byte(tmp1, Address(obja, result));
8084   load_unsigned_byte(tmp2, Address(objb, result));
8085   xorl(tmp1, tmp2);
8086   testl(tmp1, tmp1);
8087   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8088   decq(length);
8089   jcc(Assembler::zero, SAME_TILL_END);
8090   incq(result);
8091   load_unsigned_byte(tmp1, Address(obja, result));
8092   load_unsigned_byte(tmp2, Address(objb, result));
8093   xorl(tmp1, tmp2);
8094   testl(tmp1, tmp1);
8095   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8096   jmp(SAME_TILL_END);
8097 
8098   if (UseAVX >= 2) {
8099     bind(VECTOR32_NOT_EQUAL);
8100     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
8101     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
8102     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
8103     vpmovmskb(tmp1, rymm0);
8104     bsfq(tmp1, tmp1);
8105     addq(result, tmp1);
8106     shrq(result);
8107     jmp(DONE);
8108   }
8109 
8110   bind(VECTOR16_NOT_EQUAL);
8111   if (UseAVX >= 2) {
8112     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
8113     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
8114     pxor(rymm0, rymm2);
8115   } else {
8116     pcmpeqb(rymm2, rymm2);
8117     pxor(rymm0, rymm1);
8118     pcmpeqb(rymm0, rymm1);
8119     pxor(rymm0, rymm2);
8120   }
8121   pmovmskb(tmp1, rymm0);
8122   bsfq(tmp1, tmp1);
8123   addq(result, tmp1);
8124   shrq(result);
8125   jmpb(DONE);
8126 
8127   bind(VECTOR8_NOT_EQUAL);
8128   bind(VECTOR4_NOT_EQUAL);
8129   bsfq(tmp1, tmp1);
8130   shrq(tmp1, 3);
8131   addq(result, tmp1);
8132   bind(BYTES_NOT_EQUAL);
8133   shrq(result);
8134   jmpb(DONE);
8135 
8136   bind(SAME_TILL_END);
8137   mov64(result, -1);
8138 
8139   bind(DONE);
8140 }
8141 
8142 //Helper functions for square_to_len()
8143 
8144 /**
8145  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
8146  * Preserves x and z and modifies rest of the registers.
8147  */
8148 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8149   // Perform square and right shift by 1
8150   // Handle odd xlen case first, then for even xlen do the following
8151   // jlong carry = 0;
8152   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
8153   //     huge_128 product = x[j:j+1] * x[j:j+1];
8154   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
8155   //     z[i+2:i+3] = (jlong)(product >>> 1);
8156   //     carry = (jlong)product;
8157   // }
8158 
8159   xorq(tmp5, tmp5);     // carry
8160   xorq(rdxReg, rdxReg);
8161   xorl(tmp1, tmp1);     // index for x
8162   xorl(tmp4, tmp4);     // index for z
8163 
8164   Label L_first_loop, L_first_loop_exit;
8165 
8166   testl(xlen, 1);
8167   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
8168 
8169   // Square and right shift by 1 the odd element using 32 bit multiply
8170   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
8171   imulq(raxReg, raxReg);
8172   shrq(raxReg, 1);
8173   adcq(tmp5, 0);
8174   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
8175   incrementl(tmp1);
8176   addl(tmp4, 2);
8177 
8178   // Square and  right shift by 1 the rest using 64 bit multiply
8179   bind(L_first_loop);
8180   cmpptr(tmp1, xlen);
8181   jccb(Assembler::equal, L_first_loop_exit);
8182 
8183   // Square
8184   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
8185   rorq(raxReg, 32);    // convert big-endian to little-endian
8186   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
8187 
8188   // Right shift by 1 and save carry
8189   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
8190   rcrq(rdxReg, 1);
8191   rcrq(raxReg, 1);
8192   adcq(tmp5, 0);
8193 
8194   // Store result in z
8195   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
8196   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
8197 
8198   // Update indices for x and z
8199   addl(tmp1, 2);
8200   addl(tmp4, 4);
8201   jmp(L_first_loop);
8202 
8203   bind(L_first_loop_exit);
8204 }
8205 
8206 
8207 /**
8208  * Perform the following multiply add operation using BMI2 instructions
8209  * carry:sum = sum + op1*op2 + carry
8210  * op2 should be in rdx
8211  * op2 is preserved, all other registers are modified
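      * i.e. a 64x64->128-bit multiply-accumulate: the low 64 bits land in sum, the high 64 bits in carry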
8212  */
8213 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
8214   // op2 must already be in rdx (the implicit multiplicand of mulxq)
8215   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
8216   addq(sum, carry);
8217   adcq(tmp2, 0);
8218   addq(sum, op1);
8219   adcq(tmp2, 0);
8220   movq(carry, tmp2);
8221 }
8222 
8223 /**
8224  * Perform the following multiply add operation:
8225  * carry:sum = sum + op1*op2 + carry
8226  * Preserves op1 and op2, and modifies the rest of the registers
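      * Same operation as multiply_add_64_bmi2(), implemented with mulq (rdx:rax) instead of mulxq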
8227  */
8228 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
8229   // rdx:rax = op1 * op2
8230   movq(raxReg, op2);
8231   mulq(op1);
8232 
8233   //  rdx:rax = sum + carry + rdx:rax
8234   addq(sum, carry);
8235   adcq(rdxReg, 0);
8236   addq(sum, raxReg);
8237   adcq(rdxReg, 0);
8238 
8239   // carry:sum = rdx:sum
8240   movq(carry, rdxReg);
8241 }
8242 
8243 /**
8244  * Add 64 bit long carry into z[] with carry propagation.
8245  * Preserves z and carry register values and modifies rest of registers.
8246  *
8247  */
8248 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
8249   Label L_fourth_loop, L_fourth_loop_exit;
8250 
8251   movl(tmp1, 1);
8252   subl(zlen, 2);
8253   addq(Address(z, zlen, Address::times_4, 0), carry);
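       // If that add carried out, the loop below propagates a 1 into successively more
       // significant (lower-index) words of z.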
8254 
8255   bind(L_fourth_loop);
8256   jccb(Assembler::carryClear, L_fourth_loop_exit);
8257   subl(zlen, 2);
8258   jccb(Assembler::negative, L_fourth_loop_exit);
8259   addq(Address(z, zlen, Address::times_4, 0), tmp1);
8260   jmp(L_fourth_loop);
8261   bind(L_fourth_loop_exit);
8262 }
8263 
8264 /**
8265  * Shift z[] left by 1 bit.
8266  * Preserves the x, len, z and zlen registers and modifies the rest of the registers.
8267  *
8268  */
8269 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
8270 
8271   Label L_fifth_loop, L_fifth_loop_exit;
8272 
8273   // Fifth loop
8274   // Perform primitiveLeftShift(z, zlen, 1)
8275 
8276   const Register prev_carry = tmp1;
8277   const Register new_carry = tmp4;
8278   const Register value = tmp2;
8279   const Register zidx = tmp3;
8280 
8281   // int zidx, carry;
8282   // long value;
8283   // carry = 0;
8284   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
8285   //    (carry:value)  = (z[i] << 1) | carry ;
8286   //    z[i] = value;
8287   // }
8288 
8289   movl(zidx, zlen);
8290   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
8291 
8292   bind(L_fifth_loop);
8293   decl(zidx);  // Use decl to preserve carry flag
8294   decl(zidx);
8295   jccb(Assembler::negative, L_fifth_loop_exit);
8296 
8297   if (UseBMI2Instructions) {
8298      movq(value, Address(z, zidx, Address::times_4, 0));
8299      rclq(value, 1);
8300      rorxq(value, value, 32);
8301      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8302   }
8303   else {
8304     // clear new_carry
8305     xorl(new_carry, new_carry);
8306 
8307     // Shift z[i] by 1, or in previous carry and save new carry
8308     movq(value, Address(z, zidx, Address::times_4, 0));
8309     shlq(value, 1);
8310     adcl(new_carry, 0);
8311 
8312     orq(value, prev_carry);
8313     rorq(value, 0x20);
8314     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
8315 
8316     // Set previous carry = new carry
8317     movl(prev_carry, new_carry);
8318   }
8319   jmp(L_fifth_loop);
8320 
8321   bind(L_fifth_loop_exit);
8322 }
8323 
8324 
8325 /**
8326  * Code for BigInteger::squareToLen() intrinsic
8327  *
8328  * rdi: x
8329  * rsi: len
8330  * r8:  z
8331  * rcx: zlen
8332  * r12: tmp1
8333  * r13: tmp2
8334  * r14: tmp3
8335  * r15: tmp4
8336  * rbx: tmp5
8337  *
8338  */
8339 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8340 
8341   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
8342   push(tmp1);
8343   push(tmp2);
8344   push(tmp3);
8345   push(tmp4);
8346   push(tmp5);
8347 
8348   // First loop
8349   // Store the squares, right shifted one bit (i.e., divided by 2).
8350   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
8351 
8352   // Add in off-diagonal sums.
8353   //
8354   // Second, third (nested) and fourth loops.
8355   // zlen +=2;
8356   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
8357   //    carry = 0;
8358   //    long op2 = x[xidx:xidx+1];
8359   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
8360   //       k -= 2;
8361   //       long op1 = x[j:j+1];
8362   //       long sum = z[k:k+1];
8363   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
8364   //       z[k:k+1] = sum;
8365   //    }
8366   //    add_one_64(z, k, carry, tmp_regs);
8367   // }
8368 
8369   const Register carry = tmp5;
8370   const Register sum = tmp3;
8371   const Register op1 = tmp4;
8372   Register op2 = tmp2;
8373 
8374   push(zlen);
8375   push(len);
8376   addl(zlen,2);
8377   bind(L_second_loop);
8378   xorq(carry, carry);
8379   subl(zlen, 4);
8380   subl(len, 2);
8381   push(zlen);
8382   push(len);
8383   cmpl(len, 0);
8384   jccb(Assembler::lessEqual, L_second_loop_exit);
8385 
8386   // Multiply an array by one 64 bit long.
8387   if (UseBMI2Instructions) {
8388     op2 = rdxReg;
8389     movq(op2, Address(x, len, Address::times_4,  0));
8390     rorxq(op2, op2, 32);
8391   }
8392   else {
8393     movq(op2, Address(x, len, Address::times_4,  0));
8394     rorq(op2, 32);
8395   }
8396 
8397   bind(L_third_loop);
8398   decrementl(len);
8399   jccb(Assembler::negative, L_third_loop_exit);
8400   decrementl(len);
8401   jccb(Assembler::negative, L_last_x);
8402 
8403   movq(op1, Address(x, len, Address::times_4,  0));
8404   rorq(op1, 32);
8405 
8406   bind(L_multiply);
8407   subl(zlen, 2);
8408   movq(sum, Address(z, zlen, Address::times_4,  0));
8409 
8410   // Multiply 64 bit by 64 bit; add the low 64 bits of the product into sum and keep the high 64 bits as the carry.
8411   if (UseBMI2Instructions) {
8412     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
8413   }
8414   else {
8415     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8416   }
8417 
8418   movq(Address(z, zlen, Address::times_4, 0), sum);
8419 
8420   jmp(L_third_loop);
8421   bind(L_third_loop_exit);
8422 
8423   // Fourth loop
8424   // Add 64 bit long carry into z with carry propagation.
8425   // Uses offsetted zlen.
8426   add_one_64(z, zlen, carry, tmp1);
8427 
8428   pop(len);
8429   pop(zlen);
8430   jmp(L_second_loop);
8431 
8432   // The following infrequent code is moved outside the loops.
8433   bind(L_last_x);
8434   movl(op1, Address(x, 0));
8435   jmp(L_multiply);
8436 
8437   bind(L_second_loop_exit);
8438   pop(len);
8439   pop(zlen);
8440   pop(len);
8441   pop(zlen);
8442 
8443   // Fifth loop
8444   // Shift z left 1 bit.
8445   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
8446 
8447   // z[zlen-1] |= x[len-1] & 1;
8448   movl(tmp3, Address(x, len, Address::times_4, -4));
8449   andl(tmp3, 1);
8450   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
8451 
8452   pop(tmp5);
8453   pop(tmp4);
8454   pop(tmp3);
8455   pop(tmp2);
8456   pop(tmp1);
8457 }
8458 
8459 /**
8460  * Helper function for mul_add()
8461  * Multiply in[] by int k and add the result to out[] starting at offset offs, using
8462  * a 128-bit by 32-bit multiply; the carry is returned in tmp5.
8463  * Only the quad-int-aligned portion of in[] is processed by this function.
8464  * k is in rdxReg when BMI2 instructions are used, otherwise in tmp2.
8465  * This function preserves the out, in and k registers.
8466  * len and offset point to the appropriate indices in "in" and "out" respectively.
8467  * tmp5 holds the carry.
8468  * The other registers are temporaries and are modified.
8469  *
8470  */
8471 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
8472   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
8473   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8474 
8475   Label L_first_loop, L_first_loop_exit;
8476 
8477   movl(tmp1, len);
8478   shrl(tmp1, 2);
8479 
8480   bind(L_first_loop);
8481   subl(tmp1, 1);
8482   jccb(Assembler::negative, L_first_loop_exit);
8483 
8484   subl(len, 4);
8485   subl(offset, 4);
8486 
8487   Register op2 = tmp2;
8488   const Register sum = tmp3;
8489   const Register op1 = tmp4;
8490   const Register carry = tmp5;
8491 
8492   if (UseBMI2Instructions) {
8493     op2 = rdxReg;
8494   }
8495 
8496   movq(op1, Address(in, len, Address::times_4,  8));
8497   rorq(op1, 32);
8498   movq(sum, Address(out, offset, Address::times_4,  8));
8499   rorq(sum, 32);
8500   if (UseBMI2Instructions) {
8501     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8502   }
8503   else {
8504     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8505   }
8506   // Store back in big endian from little endian
8507   rorq(sum, 0x20);
8508   movq(Address(out, offset, Address::times_4,  8), sum);
8509 
8510   movq(op1, Address(in, len, Address::times_4,  0));
8511   rorq(op1, 32);
8512   movq(sum, Address(out, offset, Address::times_4,  0));
8513   rorq(sum, 32);
8514   if (UseBMI2Instructions) {
8515     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8516   }
8517   else {
8518     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8519   }
8520   // Store back in big endian from little endian
8521   rorq(sum, 0x20);
8522   movq(Address(out, offset, Address::times_4,  0), sum);
8523 
8524   jmp(L_first_loop);
8525   bind(L_first_loop_exit);
8526 }
8527 
8528 /**
8529  * Code for BigInteger::mulAdd() intrinsic
8530  *
8531  * rdi: out
8532  * rsi: in
8533  * r11: offs (out.length - offset)
8534  * rcx: len
8535  * r8:  k
8536  * r12: tmp1
8537  * r13: tmp2
8538  * r14: tmp3
8539  * r15: tmp4
8540  * rbx: tmp5
8541  * Multiply the in[] by word k and add to out[], return the carry in rax
8542  */
8543 void MacroAssembler::mul_add(Register out, Register in, Register offs,
8544    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
8545    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
8546 
8547   Label L_carry, L_last_in, L_done;
8548 
8549   // carry = 0;
8550   // for (int j=len-1; j >= 0; j--) {
8551   //    long product = (in[j] & LONG_MASK) * kLong +
8552   //                   (out[offs] & LONG_MASK) + carry;
8553   //    out[offs--] = (int)product;
8554   //    carry = product >>> 32;
8555   // }
8556   //
8557   push(tmp1);
8558   push(tmp2);
8559   push(tmp3);
8560   push(tmp4);
8561   push(tmp5);
8562 
8563   Register op2 = tmp2;
8564   const Register sum = tmp3;
8565   const Register op1 = tmp4;
8566   const Register carry =  tmp5;
8567 
8568   if (UseBMI2Instructions) {
8569     op2 = rdxReg;
8570     movl(op2, k);
8571   }
8572   else {
8573     movl(op2, k);
8574   }
8575 
8576   xorq(carry, carry);
8577 
8578   // First loop
8579 
8580   // Multiply in[] by k in a 4-way unrolled loop using a 128-bit by 32-bit multiply
8581   // The carry is in tmp5
8582   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
8583 
8584   // Multiply the trailing in[] entry using a 64-bit by 32-bit multiply, if any
8585   decrementl(len);
8586   jccb(Assembler::negative, L_carry);
8587   decrementl(len);
8588   jccb(Assembler::negative, L_last_in);
8589 
8590   movq(op1, Address(in, len, Address::times_4,  0));
8591   rorq(op1, 32);
8592 
8593   subl(offs, 2);
8594   movq(sum, Address(out, offs, Address::times_4,  0));
8595   rorq(sum, 32);
8596 
8597   if (UseBMI2Instructions) {
8598     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
8599   }
8600   else {
8601     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
8602   }
8603 
8604   // Store back in big endian from little endian
8605   rorq(sum, 0x20);
8606   movq(Address(out, offs, Address::times_4,  0), sum);
8607 
8608   testl(len, len);
8609   jccb(Assembler::zero, L_carry);
8610 
8611   //Multiply the last in[] entry, if any
8612   bind(L_last_in);
8613   movl(op1, Address(in, 0));
8614   movl(sum, Address(out, offs, Address::times_4,  -4));
8615 
8616   movl(raxReg, k);
8617   mull(op1); //tmp4 * eax -> edx:eax
8618   addl(sum, carry);
8619   adcl(rdxReg, 0);
8620   addl(sum, raxReg);
8621   adcl(rdxReg, 0);
8622   movl(carry, rdxReg);
8623 
8624   movl(Address(out, offs, Address::times_4,  -4), sum);
8625 
8626   bind(L_carry);
8627   //return tmp5/carry as carry in rax
8628   movl(rax, carry);
8629 
8630   bind(L_done);
8631   pop(tmp5);
8632   pop(tmp4);
8633   pop(tmp3);
8634   pop(tmp2);
8635   pop(tmp1);
8636 }
8637 #endif
8638 
8639 /**
8640  * Emits code to update CRC-32 with a byte value according to constants in table
8641  *
8642  * @param [in,out]crc   Register containing the crc.
8643  * @param [in]val       Register containing the byte to fold into the CRC.
8644  * @param [in]table     Register containing the table of crc constants.
8645  *
8646  * uint32_t crc;
8647  * val = crc_table[(val ^ crc) & 0xFF];
8648  * crc = val ^ (crc >> 8);
8649  *
8650  */
8651 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
8652   xorl(val, crc);
8653   andl(val, 0xFF);
8654   shrl(crc, 8); // unsigned shift
8655   xorl(crc, Address(table, val, Address::times_4, 0));
8656 }
8657 
8658 /**
8659 * Fold four 128-bit data chunks
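     * In each 128-bit lane: xcrc = clmul(xK.hi, xcrc.hi) ^ clmul(xK.lo, xcrc.lo) ^ buf[offset]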
8660 */
8661 void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8662   evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64]
8663   evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
8664   evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
8665   evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
8666 }
8667 
8668 /**
8669  * Fold 128-bit data chunk
8670  */
8671 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
8672   if (UseAVX > 0) {
8673     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
8674     vpclmulldq(xcrc, xK, xcrc); // [63:0]
8675     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
8676     pxor(xcrc, xtmp);
8677   } else {
8678     movdqa(xtmp, xcrc);
8679     pclmulhdq(xtmp, xK);   // [123:64]
8680     pclmulldq(xcrc, xK);   // [63:0]
8681     pxor(xcrc, xtmp);
8682     movdqu(xtmp, Address(buf, offset));
8683     pxor(xcrc, xtmp);
8684   }
8685 }
8686 
8687 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
8688   if (UseAVX > 0) {
8689     vpclmulhdq(xtmp, xK, xcrc);
8690     vpclmulldq(xcrc, xK, xcrc);
8691     pxor(xcrc, xbuf);
8692     pxor(xcrc, xtmp);
8693   } else {
8694     movdqa(xtmp, xcrc);
8695     pclmulhdq(xtmp, xK);
8696     pclmulldq(xcrc, xK);
8697     pxor(xcrc, xbuf);
8698     pxor(xcrc, xtmp);
8699   }
8700 }
8701 
8702 /**
8703  * 8-bit folds to compute 32-bit CRC
8704  *
8705  * uint64_t xcrc;
8706  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
8707  */
8708 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
8709   movdl(tmp, xcrc);
8710   andl(tmp, 0xFF);
8711   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
8712   psrldq(xcrc, 1); // unsigned shift one byte
8713   pxor(xcrc, xtmp);
8714 }
8715 
8716 /**
8717  * uint32_t crc;
8718  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
8719  */
8720 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
8721   movl(tmp, crc);
8722   andl(tmp, 0xFF);
8723   shrl(crc, 8);
8724   xorl(crc, Address(table, tmp, Address::times_4, 0));
8725 }
8726 
8727 /**
8728  * @param crc   register containing existing CRC (32-bit)
8729  * @param buf   register pointing to input byte buffer (byte*)
8730  * @param len   register containing number of bytes
8731  * @param table register that will contain address of CRC table
8732  * @param tmp   scratch register
8733  */
8734 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
8735   assert_different_registers(crc, buf, len, table, tmp, rax);
8736 
8737   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
8738   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
8739 
8740   // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
8741   // context for the registers used, since all instructions below use 128-bit mode.
8742   // On EVEX without VL and BW, these instructions will all be AVX.
8743   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
8744   notl(crc); // ~crc
8745   cmpl(len, 16);
8746   jcc(Assembler::less, L_tail);
8747 
8748   // Align buffer to 16 bytes
8749   movl(tmp, buf);
8750   andl(tmp, 0xF);
8751   jccb(Assembler::zero, L_aligned);
8752   subl(tmp,  16);
8753   addl(len, tmp);
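       // tmp = -(number of bytes needed to reach 16-byte alignment); len has been reduced by
       // that amount and the align loop below consumes those leading bytes one at a time.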
8754 
8755   align(4);
8756   BIND(L_align_loop);
8757   movsbl(rax, Address(buf, 0)); // load byte with sign extension
8758   update_byte_crc32(crc, rax, table);
8759   increment(buf);
8760   incrementl(tmp);
8761   jccb(Assembler::less, L_align_loop);
8762 
8763   BIND(L_aligned);
8764   movl(tmp, len); // save
8765   shrl(len, 4);
8766   jcc(Assembler::zero, L_tail_restore);
8767 
8768   // Fold total 512 bits of polynomial on each iteration
8769   if (VM_Version::supports_vpclmulqdq()) {
8770     Label Parallel_loop, L_No_Parallel;
8771 
8772     cmpl(len, 8);
8773     jccb(Assembler::less, L_No_Parallel);
8774 
8775     movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8776     evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
8777     movdl(xmm5, crc);
8778     evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
8779     addptr(buf, 64);
8780     subl(len, 7);
8781     evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits
8782 
8783     BIND(Parallel_loop);
8784     fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
8785     addptr(buf, 64);
8786     subl(len, 4);
8787     jcc(Assembler::greater, Parallel_loop);
8788 
8789     vextracti64x2(xmm2, xmm1, 0x01);
8790     vextracti64x2(xmm3, xmm1, 0x02);
8791     vextracti64x2(xmm4, xmm1, 0x03);
8792     jmp(L_fold_512b);
8793 
8794     BIND(L_No_Parallel);
8795   }
8796   // Fold crc into first bytes of vector
8797   movdqa(xmm1, Address(buf, 0));
8798   movdl(rax, xmm1);
8799   xorl(crc, rax);
8800   if (VM_Version::supports_sse4_1()) {
8801     pinsrd(xmm1, crc, 0);
8802   } else {
8803     pinsrw(xmm1, crc, 0);
8804     shrl(crc, 16);
8805     pinsrw(xmm1, crc, 1);
8806   }
8807   addptr(buf, 16);
8808   subl(len, 4); // len > 0
8809   jcc(Assembler::less, L_fold_tail);
8810 
8811   movdqa(xmm2, Address(buf,  0));
8812   movdqa(xmm3, Address(buf, 16));
8813   movdqa(xmm4, Address(buf, 32));
8814   addptr(buf, 48);
8815   subl(len, 3);
8816   jcc(Assembler::lessEqual, L_fold_512b);
8817 
8818   // Fold total 512 bits of polynomial on each iteration,
8819   // 128 bits per each of 4 parallel streams.
8820   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
8821 
8822   align(32);
8823   BIND(L_fold_512b_loop);
8824   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
8825   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
8826   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
8827   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
8828   addptr(buf, 64);
8829   subl(len, 4);
8830   jcc(Assembler::greater, L_fold_512b_loop);
8831 
8832   // Fold 512 bits to 128 bits.
8833   BIND(L_fold_512b);
8834   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
8835   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
8836   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
8837   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
8838 
8839   // Fold the rest of 128 bits data chunks
8840   BIND(L_fold_tail);
8841   addl(len, 3);
8842   jccb(Assembler::lessEqual, L_fold_128b);
8843   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
8844 
8845   BIND(L_fold_tail_loop);
8846   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
8847   addptr(buf, 16);
8848   decrementl(len);
8849   jccb(Assembler::greater, L_fold_tail_loop);
8850 
8851   // Fold 128 bits in xmm1 down into 32 bits in crc register.
8852   BIND(L_fold_128b);
8853   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
8854   if (UseAVX > 0) {
8855     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
8856     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
8857     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
8858   } else {
8859     movdqa(xmm2, xmm0);
8860     pclmulqdq(xmm2, xmm1, 0x1);
8861     movdqa(xmm3, xmm0);
8862     pand(xmm3, xmm2);
8863     pclmulqdq(xmm0, xmm3, 0x1);
8864   }
8865   psrldq(xmm1, 8);
8866   psrldq(xmm2, 4);
8867   pxor(xmm0, xmm1);
8868   pxor(xmm0, xmm2);
8869 
8870   // 8 8-bit folds to compute 32-bit CRC.
8871   for (int j = 0; j < 4; j++) {
8872     fold_8bit_crc32(xmm0, table, xmm1, rax);
8873   }
8874   movdl(crc, xmm0); // mov 32 bits to general register
8875   for (int j = 0; j < 4; j++) {
8876     fold_8bit_crc32(crc, table, rax);
8877   }
8878 
8879   BIND(L_tail_restore);
8880   movl(len, tmp); // restore
8881   BIND(L_tail);
8882   andl(len, 0xf);
8883   jccb(Assembler::zero, L_exit);
8884 
8885   // Fold the rest of bytes
8886   align(4);
8887   BIND(L_tail_loop);
8888   movsbl(rax, Address(buf, 0)); // load byte with sign extension
8889   update_byte_crc32(crc, rax, table);
8890   increment(buf);
8891   decrementl(len);
8892   jccb(Assembler::greater, L_tail_loop);
8893 
8894   BIND(L_exit);
8895   notl(crc); // ~crc
8896 }
8897 
8898 #ifdef _LP64
8899 // S. Gueron / Information Processing Letters 112 (2012) 184
8900 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
8901 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
8902 // Output: the 64-bit carry-less product of B * CONST
8903 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
8904                                      Register tmp1, Register tmp2, Register tmp3) {
8905   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
8906   if (n > 0) {
8907     addq(tmp3, n * 256 * 8);
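         // skip ahead to table n: each table holds 256 eight-byte entries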
8908   }
8909   //    Q1 = TABLEExt[n][B & 0xFF];
8910   movl(tmp1, in);
8911   andl(tmp1, 0x000000FF);
8912   shll(tmp1, 3);
8913   addq(tmp1, tmp3);
8914   movq(tmp1, Address(tmp1, 0));
8915 
8916   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
8917   movl(tmp2, in);
8918   shrl(tmp2, 8);
8919   andl(tmp2, 0x000000FF);
8920   shll(tmp2, 3);
8921   addq(tmp2, tmp3);
8922   movq(tmp2, Address(tmp2, 0));
8923 
8924   shlq(tmp2, 8);
8925   xorq(tmp1, tmp2);
8926 
8927   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
8928   movl(tmp2, in);
8929   shrl(tmp2, 16);
8930   andl(tmp2, 0x000000FF);
8931   shll(tmp2, 3);
8932   addq(tmp2, tmp3);
8933   movq(tmp2, Address(tmp2, 0));
8934 
8935   shlq(tmp2, 16);
8936   xorq(tmp1, tmp2);
8937 
8938   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
8939   shrl(in, 24);
8940   andl(in, 0x000000FF);
8941   shll(in, 3);
8942   addq(in, tmp3);
8943   movq(in, Address(in, 0));
8944 
8945   shlq(in, 24);
8946   xorq(in, tmp1);
8947   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
8948 }
8949 
8950 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
8951                                       Register in_out,
8952                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
8953                                       XMMRegister w_xtmp2,
8954                                       Register tmp1,
8955                                       Register n_tmp2, Register n_tmp3) {
8956   if (is_pclmulqdq_supported) {
8957     movdl(w_xtmp1, in_out); // modified blindly
8958 
8959     movl(tmp1, const_or_pre_comp_const_index);
8960     movdl(w_xtmp2, tmp1);
8961     pclmulqdq(w_xtmp1, w_xtmp2, 0);
8962 
8963     movdq(in_out, w_xtmp1);
8964   } else {
8965     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
8966   }
8967 }
8968 
8969 // Recombination Alternative 2: No bit-reflections
8970 // T1 = (CRC_A * U1) << 1
8971 // T2 = (CRC_B * U2) << 1
8972 // C1 = T1 >> 32
8973 // C2 = T2 >> 32
8974 // T1 = T1 & 0xFFFFFFFF
8975 // T2 = T2 & 0xFFFFFFFF
8976 // T1 = CRC32(0, T1)
8977 // T2 = CRC32(0, T2)
8978 // C1 = C1 ^ T1
8979 // C2 = C2 ^ T2
8980 // CRC = C1 ^ C2 ^ CRC_C
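//
// A scalar sketch of the recombination above (illustrative only; clmul32()
// stands for a 32x32->64-bit carry-less multiply and crc32_u32() for one
// 4-byte step of the hardware CRC32C instruction):
//
//   uint32_t recombine(uint32_t crc_a, uint32_t crc_b, uint32_t crc_c,
//                      uint32_t u1, uint32_t u2) {
//     uint64_t t1 = clmul32(crc_a, u1) << 1;
//     uint64_t t2 = clmul32(crc_b, u2) << 1;
//     uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32_u32(0, (uint32_t)t1);
//     uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32_u32(0, (uint32_t)t2);
//     return c1 ^ c2 ^ crc_c;
//   }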
8981 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
8982                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
8983                                      Register tmp1, Register tmp2,
8984                                      Register n_tmp3) {
8985   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8986   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
8987   shlq(in_out, 1);
8988   movl(tmp1, in_out);
8989   shrq(in_out, 32);
8990   xorl(tmp2, tmp2);
8991   crc32(tmp2, tmp1, 4);
8992   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
8993   shlq(in1, 1);
8994   movl(tmp1, in1);
8995   shrq(in1, 32);
8996   xorl(tmp2, tmp2);
8997   crc32(tmp2, tmp1, 4);
8998   xorl(in1, tmp2);
8999   xorl(in_out, in1);
9000   xorl(in_out, in2);
9001 }
9002 
9003 // Set N to a predefined value.
9004 // Subtract it from the length of the buffer.
9005 // Execute in a loop:
9006 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9007 // for i = 1 to N do
9008 //  CRC_A = CRC32(CRC_A, A[i])
9009 //  CRC_B = CRC32(CRC_B, B[i])
9010 //  CRC_C = CRC32(CRC_C, C[i])
9011 // end for
9012 // Recombine
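//
// A reference sketch of one chunk (illustrative only; crc32_u64() stands for
// one 8-byte step of the hardware CRC32C instruction, a, b, c point to the
// three partitions of the chunk, and u1/u2 are the precomputed recombination
// constants):
//
//   uint32_t crc_a = crc, crc_b = 0, crc_c = 0;
//   for (uint32_t i = 0; i < n; i++) {
//     crc_a = crc32_u64(crc_a, a[i]);
//     crc_b = crc32_u64(crc_b, b[i]);
//     crc_c = crc32_u64(crc_c, c[i]);
//   }
//   crc = recombine(crc_a, crc_b, crc_c, u1, u2); // see crc32c_rec_alt2 above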
9013 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9014                                        Register in_out1, Register in_out2, Register in_out3,
9015                                        Register tmp1, Register tmp2, Register tmp3,
9016                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9017                                        Register tmp4, Register tmp5,
9018                                        Register n_tmp6) {
9019   Label L_processPartitions;
9020   Label L_processPartition;
9021   Label L_exit;
9022 
9023   bind(L_processPartitions);
9024   cmpl(in_out1, 3 * size);
9025   jcc(Assembler::less, L_exit);
9026     xorl(tmp1, tmp1);
9027     xorl(tmp2, tmp2);
9028     movq(tmp3, in_out2);
9029     addq(tmp3, size);
9030 
9031     bind(L_processPartition);
9032       crc32(in_out3, Address(in_out2, 0), 8);
9033       crc32(tmp1, Address(in_out2, size), 8);
9034       crc32(tmp2, Address(in_out2, size * 2), 8);
9035       addq(in_out2, 8);
9036       cmpq(in_out2, tmp3);
9037       jcc(Assembler::less, L_processPartition);
9038     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9039             w_xtmp1, w_xtmp2, w_xtmp3,
9040             tmp4, tmp5,
9041             n_tmp6);
9042     addq(in_out2, 2 * size);
9043     subl(in_out1, 3 * size);
9044     jmp(L_processPartitions);
9045 
9046   bind(L_exit);
9047 }
9048 #else
9049 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9050                                      Register tmp1, Register tmp2, Register tmp3,
9051                                      XMMRegister xtmp1, XMMRegister xtmp2) {
9052   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9053   if (n > 0) {
9054     addl(tmp3, n * 256 * 8);
9055   }
9056   //    Q1 = TABLEExt[n][B & 0xFF];
9057   movl(tmp1, in_out);
9058   andl(tmp1, 0x000000FF);
9059   shll(tmp1, 3);
9060   addl(tmp1, tmp3);
9061   movq(xtmp1, Address(tmp1, 0));
9062 
9063   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9064   movl(tmp2, in_out);
9065   shrl(tmp2, 8);
9066   andl(tmp2, 0x000000FF);
9067   shll(tmp2, 3);
9068   addl(tmp2, tmp3);
9069   movq(xtmp2, Address(tmp2, 0));
9070 
9071   psllq(xtmp2, 8);
9072   pxor(xtmp1, xtmp2);
9073 
9074   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9075   movl(tmp2, in_out);
9076   shrl(tmp2, 16);
9077   andl(tmp2, 0x000000FF);
9078   shll(tmp2, 3);
9079   addl(tmp2, tmp3);
9080   movq(xtmp2, Address(tmp2, 0));
9081 
9082   psllq(xtmp2, 16);
9083   pxor(xtmp1, xtmp2);
9084 
9085   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9086   shrl(in_out, 24);
9087   andl(in_out, 0x000000FF);
9088   shll(in_out, 3);
9089   addl(in_out, tmp3);
9090   movq(xtmp2, Address(in_out, 0));
9091 
9092   psllq(xtmp2, 24);
9093   pxor(xtmp1, xtmp2); // Result in CXMM
9094   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9095 }
9096 
9097 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9098                                       Register in_out,
9099                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9100                                       XMMRegister w_xtmp2,
9101                                       Register tmp1,
9102                                       Register n_tmp2, Register n_tmp3) {
9103   if (is_pclmulqdq_supported) {
9104     movdl(w_xtmp1, in_out);
9105 
9106     movl(tmp1, const_or_pre_comp_const_index);
9107     movdl(w_xtmp2, tmp1);
9108     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9109     // Keep the result in XMM since a GPR is only 32 bits wide here
9110   } else {
9111     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
9112   }
9113 }
9114 
9115 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9116                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9117                                      Register tmp1, Register tmp2,
9118                                      Register n_tmp3) {
9119   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9120   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9121 
9122   psllq(w_xtmp1, 1);
9123   movdl(tmp1, w_xtmp1);
9124   psrlq(w_xtmp1, 32);
9125   movdl(in_out, w_xtmp1);
9126 
9127   xorl(tmp2, tmp2);
9128   crc32(tmp2, tmp1, 4);
9129   xorl(in_out, tmp2);
9130 
9131   psllq(w_xtmp2, 1);
9132   movdl(tmp1, w_xtmp2);
9133   psrlq(w_xtmp2, 32);
9134   movdl(in1, w_xtmp2);
9135 
9136   xorl(tmp2, tmp2);
9137   crc32(tmp2, tmp1, 4);
9138   xorl(in1, tmp2);
9139   xorl(in_out, in1);
9140   xorl(in_out, in2);
9141 }
9142 
9143 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9144                                        Register in_out1, Register in_out2, Register in_out3,
9145                                        Register tmp1, Register tmp2, Register tmp3,
9146                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9147                                        Register tmp4, Register tmp5,
9148                                        Register n_tmp6) {
9149   Label L_processPartitions;
9150   Label L_processPartition;
9151   Label L_exit;
9152 
9153   bind(L_processPartitions);
9154   cmpl(in_out1, 3 * size);
9155   jcc(Assembler::less, L_exit);
9156     xorl(tmp1, tmp1);
9157     xorl(tmp2, tmp2);
9158     movl(tmp3, in_out2);
9159     addl(tmp3, size);
9160 
9161     bind(L_processPartition);
9162       crc32(in_out3, Address(in_out2, 0), 4);
9163       crc32(tmp1, Address(in_out2, size), 4);
9164       crc32(tmp2, Address(in_out2, size*2), 4);
9165       crc32(in_out3, Address(in_out2, 0+4), 4);
9166       crc32(tmp1, Address(in_out2, size+4), 4);
9167       crc32(tmp2, Address(in_out2, size*2+4), 4);
9168       addl(in_out2, 8);
9169       cmpl(in_out2, tmp3);
9170       jcc(Assembler::less, L_processPartition);
9171 
9172         push(tmp3);
9173         push(in_out1);
9174         push(in_out2);
9175         tmp4 = tmp3;
9176         tmp5 = in_out1;
9177         n_tmp6 = in_out2;
9178 
9179       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9180             w_xtmp1, w_xtmp2, w_xtmp3,
9181             tmp4, tmp5,
9182             n_tmp6);
9183 
9184         pop(in_out2);
9185         pop(in_out1);
9186         pop(tmp3);
9187 
9188     addl(in_out2, 2 * size);
9189     subl(in_out1, 3 * size);
9190     jmp(L_processPartitions);
9191 
9192   bind(L_exit);
9193 }
9194 #endif // _LP64
9195 
9196 #ifdef _LP64
9197 // Algorithm 2: Pipelined usage of the CRC32 instruction.
9198 // Input: A buffer I of L bytes.
9199 // Output: the CRC32C value of the buffer.
9200 // Notations:
9201 // Write L = 24N + r, with N = floor (L/24).
9202 // r = L mod 24 (0 <= r < 24).
9203 // Consider I as the concatenation A|B|C|R, where A, B and C each consist
9204 // of N quadwords, and R consists of r bytes.
9205 // A[j] = I [8j+7 : 8j],             j = 0, 1, ..., N-1
9206 // B[j] = I [8N + 8j+7 : 8N + 8j],   j = 0, 1, ..., N-1
9207 // C[j] = I [16N + 8j+7 : 16N + 8j], j = 0, 1, ..., N-1
9208 // if r > 0: R[j] = I [24N + j],     j = 0, 1, ..., r-1
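//
// A simplified sketch of the tail handling after the three chunk passes
// (illustrative only; crc32_u32()/crc32_u8() stand for the 4-byte and 1-byte
// steps of the hardware CRC32C instruction, load_u32() for a plain 32-bit
// load, p points at the remaining bytes and r is their count):
//
//   const uint8_t* end_w = p + (r & ~7);              // word-by-word region
//   while (p < end_w) { crc = crc32_u32(crc, load_u32(p)); p += 4; }
//   for (uint32_t i = 0; i < (r & 7); i++) { crc = crc32_u8(crc, *p++); }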
9209 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9210                                           Register tmp1, Register tmp2, Register tmp3,
9211                                           Register tmp4, Register tmp5, Register tmp6,
9212                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9213                                           bool is_pclmulqdq_supported) {
9214   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9215   Label L_wordByWord;
9216   Label L_byteByByteProlog;
9217   Label L_byteByByte;
9218   Label L_exit;
9219 
9220   if (is_pclmulqdq_supported ) {
9221     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9222     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
9223 
9224     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9225     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9226 
9227     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9228     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9229     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
9230   } else {
9231     const_or_pre_comp_const_index[0] = 1;
9232     const_or_pre_comp_const_index[1] = 0;
9233 
9234     const_or_pre_comp_const_index[2] = 3;
9235     const_or_pre_comp_const_index[3] = 2;
9236 
9237     const_or_pre_comp_const_index[4] = 5;
9238     const_or_pre_comp_const_index[5] = 4;
9239    }
9240   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9241                     in2, in1, in_out,
9242                     tmp1, tmp2, tmp3,
9243                     w_xtmp1, w_xtmp2, w_xtmp3,
9244                     tmp4, tmp5,
9245                     tmp6);
9246   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9247                     in2, in1, in_out,
9248                     tmp1, tmp2, tmp3,
9249                     w_xtmp1, w_xtmp2, w_xtmp3,
9250                     tmp4, tmp5,
9251                     tmp6);
9252   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9253                     in2, in1, in_out,
9254                     tmp1, tmp2, tmp3,
9255                     w_xtmp1, w_xtmp2, w_xtmp3,
9256                     tmp4, tmp5,
9257                     tmp6);
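  // tmp1 = in1 + (in2 & ~7): end of the word-by-word region (8-byte granular)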
9258   movl(tmp1, in2);
9259   andl(tmp1, 0x00000007);
9260   negl(tmp1);
9261   addl(tmp1, in2);
9262   addq(tmp1, in1);
9263 
9264   BIND(L_wordByWord);
9265   cmpq(in1, tmp1);
9266   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9267     crc32(in_out, Address(in1, 0), 4);
9268     addq(in1, 4);
9269     jmp(L_wordByWord);
9270 
9271   BIND(L_byteByByteProlog);
9272   andl(in2, 0x00000007);
9273   movl(tmp2, 1);
9274 
9275   BIND(L_byteByByte);
9276   cmpl(tmp2, in2);
9277   jccb(Assembler::greater, L_exit);
9278     crc32(in_out, Address(in1, 0), 1);
9279     incq(in1);
9280     incl(tmp2);
9281     jmp(L_byteByByte);
9282 
9283   BIND(L_exit);
9284 }
9285 #else
9286 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
9287                                           Register tmp1, Register  tmp2, Register tmp3,
9288                                           Register tmp4, Register  tmp5, Register tmp6,
9289                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9290                                           bool is_pclmulqdq_supported) {
9291   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
9292   Label L_wordByWord;
9293   Label L_byteByByteProlog;
9294   Label L_byteByByte;
9295   Label L_exit;
9296 
9297   if (is_pclmulqdq_supported) {
9298     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
9299     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
9300 
9301     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
9302     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
9303 
9304     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
9305     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
9306   } else {
9307     const_or_pre_comp_const_index[0] = 1;
9308     const_or_pre_comp_const_index[1] = 0;
9309 
9310     const_or_pre_comp_const_index[2] = 3;
9311     const_or_pre_comp_const_index[3] = 2;
9312 
9313     const_or_pre_comp_const_index[4] = 5;
9314     const_or_pre_comp_const_index[5] = 4;
9315   }
9316   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
9317                     in2, in1, in_out,
9318                     tmp1, tmp2, tmp3,
9319                     w_xtmp1, w_xtmp2, w_xtmp3,
9320                     tmp4, tmp5,
9321                     tmp6);
9322   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
9323                     in2, in1, in_out,
9324                     tmp1, tmp2, tmp3,
9325                     w_xtmp1, w_xtmp2, w_xtmp3,
9326                     tmp4, tmp5,
9327                     tmp6);
9328   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
9329                     in2, in1, in_out,
9330                     tmp1, tmp2, tmp3,
9331                     w_xtmp1, w_xtmp2, w_xtmp3,
9332                     tmp4, tmp5,
9333                     tmp6);
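  // tmp1 = in1 + (in2 & ~7): end of the word-by-word region (8-byte granular)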
9334   movl(tmp1, in2);
9335   andl(tmp1, 0x00000007);
9336   negl(tmp1);
9337   addl(tmp1, in2);
9338   addl(tmp1, in1);
9339 
9340   BIND(L_wordByWord);
9341   cmpl(in1, tmp1);
9342   jcc(Assembler::greaterEqual, L_byteByByteProlog);
9343     crc32(in_out, Address(in1,0), 4);
9344     addl(in1, 4);
9345     jmp(L_wordByWord);
9346 
9347   BIND(L_byteByByteProlog);
9348   andl(in2, 0x00000007);
9349   movl(tmp2, 1);
9350 
9351   BIND(L_byteByByte);
9352   cmpl(tmp2, in2);
9353   jccb(Assembler::greater, L_exit);
9354     movb(tmp1, Address(in1, 0));
9355     crc32(in_out, tmp1, 1);
9356     incl(in1);
9357     incl(tmp2);
9358     jmp(L_byteByByte);
9359 
9360   BIND(L_exit);
9361 }
9362 #endif // _LP64
9363 #undef BIND
9364 #undef BLOCK_COMMENT
9365 
9366 // Compress char[] array to byte[].
9367 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
9368 //   @HotSpotIntrinsicCandidate
9369 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
9370 //     for (int i = 0; i < len; i++) {
9371 //       int c = src[srcOff++];
9372 //       if (c >>> 8 != 0) {
9373 //         return 0;
9374 //       }
9375 //       dst[dstOff++] = (byte)c;
9376 //     }
9377 //     return len;
9378 //   }
9379 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
9380   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
9381   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
9382   Register tmp5, Register result) {
9383   Label copy_chars_loop, return_length, return_zero, done;
9384 
9385   // rsi: src
9386   // rdi: dst
9387   // rdx: len
9388   // rcx: tmp5
9389   // rax: result
9390 
9391   // rsi holds start addr of source char[] to be compressed
9392   // rdi holds start addr of destination byte[]
9393   // rdx holds length
9394 
9395   assert(len != result, "");
9396 
9397   // save length for return
9398   push(len);
9399 
9400   if ((UseAVX > 2) && // AVX512
9401     VM_Version::supports_avx512vlbw() &&
9402     VM_Version::supports_bmi2()) {
9403 
9404     Label copy_32_loop, copy_loop_tail, below_threshold;
9405 
9406     // alignment
9407     Label post_alignment;
9408 
9409     // if the length of the string is less than 32, handle it the old-fashioned way
9410     testl(len, -32);
9411     jcc(Assembler::zero, below_threshold);
9412 
9413     // First check whether a character is compressible (<= 0xFF).
9414     // Create a mask to test for Unicode chars inside the zmm vector
9415     movl(result, 0x00FF);
9416     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
9417 
9418     testl(len, -64);
9419     jcc(Assembler::zero, post_alignment);
9420 
9421     movl(tmp5, dst);
9422     andl(tmp5, (32 - 1));
9423     negl(tmp5);
9424     andl(tmp5, (32 - 1));
9425 
9426     // bail out when there is nothing to be done
9427     testl(tmp5, 0xFFFFFFFF);
9428     jcc(Assembler::zero, post_alignment);
9429 
9430     // ~(~0 << tmp5), where tmp5 is the # of elements in the alignment prefix
9431     movl(result, 0xFFFFFFFF);
9432     shlxl(result, result, tmp5);
9433     notl(result);
9434     kmovdl(k3, result);
9435 
9436     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9437     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9438     ktestd(k2, k3);
9439     jcc(Assembler::carryClear, return_zero);
9440 
9441     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9442 
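    // advance src by tmp5 chars (2 bytes each) and dst by tmp5 bytes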
9443     addptr(src, tmp5);
9444     addptr(src, tmp5);
9445     addptr(dst, tmp5);
9446     subl(len, tmp5);
9447 
9448     bind(post_alignment);
9449     // end of alignment
9450 
9451     movl(tmp5, len);
9452     andl(tmp5, (32 - 1));    // tail count (in chars)
9453     andl(len, ~(32 - 1));    // vector count (in chars)
9454     jcc(Assembler::zero, copy_loop_tail);
9455 
9456     lea(src, Address(src, len, Address::times_2));
9457     lea(dst, Address(dst, len, Address::times_1));
9458     negptr(len);
9459 
9460     bind(copy_32_loop);
9461     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
9462     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9463     kortestdl(k2, k2);
9464     jcc(Assembler::carryClear, return_zero);
9465 
9466     // All elements in the current chunk are valid candidates for
9467     // compression. Write the truncated byte elements to memory.
9468     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
9469     addptr(len, 32);
9470     jcc(Assembler::notZero, copy_32_loop);
9471 
9472     bind(copy_loop_tail);
9473     // bail out when there is nothing to be done
9474     testl(tmp5, 0xFFFFFFFF);
9475     jcc(Assembler::zero, return_length);
9476 
9477     movl(len, tmp5);
9478 
9479     // ~(~0 << len), where len is the # of remaining elements to process
9480     movl(result, 0xFFFFFFFF);
9481     shlxl(result, result, len);
9482     notl(result);
9483 
9484     kmovdl(k3, result);
9485 
9486     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
9487     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
9488     ktestd(k2, k3);
9489     jcc(Assembler::carryClear, return_zero);
9490 
9491     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
9492     jmp(return_length);
9493 
9494     bind(below_threshold);
9495   }
9496 
9497   if (UseSSE42Intrinsics) {
9498     Label copy_32_loop, copy_16, copy_tail;
9499 
9500     movl(result, len);
9501 
9502     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
9503 
9504     // vectored compression
9505     andl(len, 0xfffffff0);    // vector count (in chars)
9506     andl(result, 0x0000000f);    // tail count (in chars)
9507     testl(len, len);
9508     jcc(Assembler::zero, copy_16);
9509 
9510     // compress 16 chars per iter
9511     movdl(tmp1Reg, tmp5);
9512     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9513     pxor(tmp4Reg, tmp4Reg);
9514 
9515     lea(src, Address(src, len, Address::times_2));
9516     lea(dst, Address(dst, len, Address::times_1));
9517     negptr(len);
9518 
9519     bind(copy_32_loop);
9520     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
9521     por(tmp4Reg, tmp2Reg);
9522     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
9523     por(tmp4Reg, tmp3Reg);
9524     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
9525     jcc(Assembler::notZero, return_zero);
9526     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9527     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
9528     addptr(len, 16);
9529     jcc(Assembler::notZero, copy_32_loop);
9530 
9531     // compress next vector of 8 chars (if any)
9532     bind(copy_16);
9533     movl(len, result);
9534     andl(len, 0xfffffff8);    // vector count (in chars)
9535     andl(result, 0x00000007);    // tail count (in chars)
9536     testl(len, len);
9537     jccb(Assembler::zero, copy_tail);
9538 
9539     movdl(tmp1Reg, tmp5);
9540     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
9541     pxor(tmp3Reg, tmp3Reg);
9542 
9543     movdqu(tmp2Reg, Address(src, 0));
9544     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
9545     jccb(Assembler::notZero, return_zero);
9546     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
9547     movq(Address(dst, 0), tmp2Reg);
9548     addptr(src, 16);
9549     addptr(dst, 8);
9550 
9551     bind(copy_tail);
9552     movl(len, result);
9553   }
9554   // compress 1 char per iter
9555   testl(len, len);
9556   jccb(Assembler::zero, return_length);
9557   lea(src, Address(src, len, Address::times_2));
9558   lea(dst, Address(dst, len, Address::times_1));
9559   negptr(len);
9560 
9561   bind(copy_chars_loop);
9562   load_unsigned_short(result, Address(src, len, Address::times_2));
9563   testl(result, 0xff00);      // check if Unicode char
9564   jccb(Assembler::notZero, return_zero);
9565   movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
9566   increment(len);
9567   jcc(Assembler::notZero, copy_chars_loop);
9568 
9569   // if compression succeeded, return length
9570   bind(return_length);
9571   pop(result);
9572   jmpb(done);
9573 
9574   // if compression failed, return 0
9575   bind(return_zero);
9576   xorl(result, result);
9577   addptr(rsp, wordSize);
9578 
9579   bind(done);
9580 }
9581 
9582 // Inflate byte[] array to char[].
9583 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
9584 //   @HotSpotIntrinsicCandidate
9585 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
9586 //     for (int i = 0; i < len; i++) {
9587 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
9588 //     }
9589 //   }
9590 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
9591   XMMRegister tmp1, Register tmp2) {
9592   Label copy_chars_loop, done, below_threshold;
9593   // rsi: src
9594   // rdi: dst
9595   // rdx: len
9596   // rcx: tmp2
9597 
9598   // rsi holds start addr of source byte[] to be inflated
9599   // rdi holds start addr of destination char[]
9600   // rdx holds length
9601   assert_different_registers(src, dst, len, tmp2);
9602 
9603   if ((UseAVX > 2) && // AVX512
9604     VM_Version::supports_avx512vlbw() &&
9605     VM_Version::supports_bmi2()) {
9606 
9607     Label copy_32_loop, copy_tail;
9608     Register tmp3_aliased = len;
9609 
9610     // if the length of the string is less than 16, handle it the old-fashioned way
9611     testl(len, -16);
9612     jcc(Assembler::zero, below_threshold);
9613 
9614     // In order to use only one arithmetic operation in the main loop,
9615     // we do this pre-calculation
9616     movl(tmp2, len);
9617     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
9618     andl(len, -32);     // vector count
9619     jccb(Assembler::zero, copy_tail);
9620 
9621     lea(src, Address(src, len, Address::times_1));
9622     lea(dst, Address(dst, len, Address::times_2));
9623     negptr(len);
9624 
9625 
9626     // inflate 32 chars per iter
9627     bind(copy_32_loop);
9628     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
9629     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
9630     addptr(len, 32);
9631     jcc(Assembler::notZero, copy_32_loop);
9632 
9633     bind(copy_tail);
9634     // bail out when there is nothing to be done
9635     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
9636     jcc(Assembler::zero, done);
9637 
9638     // ~(~0 << length), where length is the # of remaining elements to process
9639     movl(tmp3_aliased, -1);
9640     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
9641     notl(tmp3_aliased);
9642     kmovdl(k2, tmp3_aliased);
9643     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
9644     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
9645 
9646     jmp(done);
9647   }
9648   if (UseSSE42Intrinsics) {
9649     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
9650 
9651     movl(tmp2, len);
9652 
9653     if (UseAVX > 1) {
9654       andl(tmp2, (16 - 1));
9655       andl(len, -16);
9656       jccb(Assembler::zero, copy_new_tail);
9657     } else {
9658       andl(tmp2, 0x00000007);   // tail count (in chars)
9659       andl(len, 0xfffffff8);    // vector count (in chars)
9660       jccb(Assembler::zero, copy_tail);
9661     }
9662 
9663     // vectored inflation
9664     lea(src, Address(src, len, Address::times_1));
9665     lea(dst, Address(dst, len, Address::times_2));
9666     negptr(len);
9667 
9668     if (UseAVX > 1) {
9669       bind(copy_16_loop);
9670       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
9671       vmovdqu(Address(dst, len, Address::times_2), tmp1);
9672       addptr(len, 16);
9673       jcc(Assembler::notZero, copy_16_loop);
9674 
9675       bind(below_threshold);
9676       bind(copy_new_tail);
9677       if ((UseAVX > 2) &&
9678         VM_Version::supports_avx512vlbw() &&
9679         VM_Version::supports_bmi2()) {
9680         movl(tmp2, len);
9681       } else {
9682         movl(len, tmp2);
9683       }
9684       andl(tmp2, 0x00000007);
9685       andl(len, 0xFFFFFFF8);
9686       jccb(Assembler::zero, copy_tail);
9687 
9688       pmovzxbw(tmp1, Address(src, 0));
9689       movdqu(Address(dst, 0), tmp1);
9690       addptr(src, 8);
9691       addptr(dst, 2 * 8);
9692 
9693       jmp(copy_tail, true);
9694     }
9695 
9696     // inflate 8 chars per iter
9697     bind(copy_8_loop);
9698     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
9699     movdqu(Address(dst, len, Address::times_2), tmp1);
9700     addptr(len, 8);
9701     jcc(Assembler::notZero, copy_8_loop);
9702 
9703     bind(copy_tail);
9704     movl(len, tmp2);
9705 
9706     cmpl(len, 4);
9707     jccb(Assembler::less, copy_bytes);
9708 
9709     movdl(tmp1, Address(src, 0));  // load 4 byte chars
9710     pmovzxbw(tmp1, tmp1);
9711     movq(Address(dst, 0), tmp1);
9712     subptr(len, 4);
9713     addptr(src, 4);
9714     addptr(dst, 8);
9715 
9716     bind(copy_bytes);
9717   } else {
9718     bind(below_threshold);
9719   }
9720 
9721   testl(len, len);
9722   jccb(Assembler::zero, done);
9723   lea(src, Address(src, len, Address::times_1));
9724   lea(dst, Address(dst, len, Address::times_2));
9725   negptr(len);
9726 
9727   // inflate 1 char per iter
9728   bind(copy_chars_loop);
9729   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
9730   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
9731   increment(len);
9732   jcc(Assembler::notZero, copy_chars_loop);
9733 
9734   bind(done);
9735 }
9736 
9737 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
9738   switch (cond) {
9739     // Note some conditions are synonyms for others
9740     case Assembler::zero:         return Assembler::notZero;
9741     case Assembler::notZero:      return Assembler::zero;
9742     case Assembler::less:         return Assembler::greaterEqual;
9743     case Assembler::lessEqual:    return Assembler::greater;
9744     case Assembler::greater:      return Assembler::lessEqual;
9745     case Assembler::greaterEqual: return Assembler::less;
9746     case Assembler::below:        return Assembler::aboveEqual;
9747     case Assembler::belowEqual:   return Assembler::above;
9748     case Assembler::above:        return Assembler::belowEqual;
9749     case Assembler::aboveEqual:   return Assembler::below;
9750     case Assembler::overflow:     return Assembler::noOverflow;
9751     case Assembler::noOverflow:   return Assembler::overflow;
9752     case Assembler::negative:     return Assembler::positive;
9753     case Assembler::positive:     return Assembler::negative;
9754     case Assembler::parity:       return Assembler::noParity;
9755     case Assembler::noParity:     return Assembler::parity;
9756   }
9757   ShouldNotReachHere(); return Assembler::overflow;
9758 }
9759 
9760 SkipIfEqual::SkipIfEqual(
9761     MacroAssembler* masm, const bool* flag_addr, bool value) {
9762   _masm = masm;
9763   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9764   _masm->jcc(Assembler::equal, _label);
9765 }
9766 
9767 SkipIfEqual::~SkipIfEqual() {
9768   _masm->bind(_label);
9769 }
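
// Typical use of SkipIfEqual (illustrative only; the flag shown is just an
// example of a bool flag):
//
//   {
//     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     // code emitted inside this scope runs only when the flag is not
//     // 'false', i.e. only when DTraceMethodProbes is true
//   } // the destructor binds the skip-target label here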
9770 
9771 // 32-bit Windows has its own fast-path implementation
9772 // of get_thread
9773 #if !defined(WIN32) || defined(_LP64)
9774 
9775 // This is simply a call to Thread::current()
9776 void MacroAssembler::get_thread(Register thread) {
9777   if (thread != rax) {
9778     push(rax);
9779   }
9780   LP64_ONLY(push(rdi);)
9781   LP64_ONLY(push(rsi);)
9782   push(rdx);
9783   push(rcx);
9784 #ifdef _LP64
9785   push(r8);
9786   push(r9);
9787   push(r10);
9788   push(r11);
9789 #endif
9790 
9791   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
9792 
9793 #ifdef _LP64
9794   pop(r11);
9795   pop(r10);
9796   pop(r9);
9797   pop(r8);
9798 #endif
9799   pop(rcx);
9800   pop(rdx);
9801   LP64_ONLY(pop(rsi);)
9802   LP64_ONLY(pop(rdi);)
9803   if (thread != rax) {
9804     mov(thread, rax);
9805     pop(rax);
9806   }
9807 }
9808 
9809 #endif