1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "jvm.h"
  27 #include "asm/assembler.hpp"
  28 #include "asm/assembler.inline.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "gc/shared/collectedHeap.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "memory/resourceArea.hpp"
  35 #include "memory/universe.hpp"
  36 #include "oops/accessDecorators.hpp"
  37 #include "oops/compressedOops.inline.hpp"
  38 #include "oops/klass.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/biasedLocking.hpp"
  41 #include "runtime/flags/flagSetting.hpp"
  42 #include "runtime/interfaceSupport.inline.hpp"
  43 #include "runtime/objectMonitor.hpp"
  44 #include "runtime/os.hpp"
  45 #include "runtime/safepoint.hpp"
  46 #include "runtime/safepointMechanism.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "runtime/thread.hpp"
  50 #include "utilities/macros.hpp"
  51 #include "vmreg_x86.inline.hpp"
  52 #include "crc32c.h"
  53 #ifdef COMPILER2
  54 #include "opto/intrinsicnode.hpp"
  55 #endif
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #define STOP(error) stop(error)
  60 #else
  61 #define BLOCK_COMMENT(str) block_comment(str)
  62 #define STOP(error) block_comment(error); stop(error)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 #ifdef ASSERT
  68 bool AbstractAssembler::pd_check_instruction_mark() { return true; }
  69 #endif
  70 
  71 static Assembler::Condition reverse[] = {
  72     Assembler::noOverflow     /* overflow      = 0x0 */ ,
  73     Assembler::overflow       /* noOverflow    = 0x1 */ ,
  74     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
  75     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
  76     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
  77     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
  78     Assembler::above          /* belowEqual    = 0x6 */ ,
  79     Assembler::belowEqual     /* above         = 0x7 */ ,
  80     Assembler::positive       /* negative      = 0x8 */ ,
  81     Assembler::negative       /* positive      = 0x9 */ ,
  82     Assembler::noParity       /* parity        = 0xa */ ,
  83     Assembler::parity         /* noParity      = 0xb */ ,
  84     Assembler::greaterEqual   /* less          = 0xc */ ,
  85     Assembler::less           /* greaterEqual  = 0xd */ ,
  86     Assembler::greater        /* lessEqual     = 0xe */ ,
  87     Assembler::lessEqual      /* greater       = 0xf, */
  88 
  89 };
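     // Example: negating a condition is a table lookup, e.g. reverse[Assembler::below]
     // yields Assembler::aboveEqual (0x2 -> 0x3). This mirrors the x86 encoding rule that
     // flipping the low bit of a condition code inverts the condition.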
  90 
  91 
  92 // Implementation of MacroAssembler
  93 
  94 // First, all the routines that have distinct 32-bit and 64-bit versions,
  95 // unless the difference is trivial (a line or so).
  96 
  97 #ifndef _LP64
  98 
  99 // 32bit versions
 100 
 101 Address MacroAssembler::as_Address(AddressLiteral adr) {
 102   return Address(adr.target(), adr.rspec());
 103 }
 104 
 105 Address MacroAssembler::as_Address(ArrayAddress adr) {
 106   return Address::make_array(adr);
 107 }
 108 
 109 void MacroAssembler::call_VM_leaf_base(address entry_point,
 110                                        int number_of_arguments) {
 111   call(RuntimeAddress(entry_point));
 112   increment(rsp, number_of_arguments * wordSize);
 113 }
 114 
 115 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
 116   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 117 }
 118 
 119 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
 120   cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 121 }
 122 
 123 void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
 124   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 125 }
 126 
 127 void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
 128   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
 129 }
 130 
 131 void MacroAssembler::cmpoop(Address src1, jobject obj) {
 132   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 133   bs->obj_equals(this, src1, obj);
 134 }
 135 
 136 void MacroAssembler::cmpoop(Register src1, jobject obj) {
 137   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
 138   bs->obj_equals(this, src1, obj);
 139 }
 140 
 141 void MacroAssembler::extend_sign(Register hi, Register lo) {
 142   // According to Intel Doc. AP-526, "Integer Divide", p.18.
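       // Sign-extends lo into hi:lo, e.g. lo = 7 gives hi = 0 and lo = -7 gives hi = 0xFFFFFFFF.
       // cdq does this in one instruction, but only for the fixed edx:eax register pair.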
 143   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
 144     cdql();
 145   } else {
 146     movl(hi, lo);
 147     sarl(hi, 31);
 148   }
 149 }
 150 
 151 void MacroAssembler::jC2(Register tmp, Label& L) {
 152   // set parity bit if FPU flag C2 is set (via rax)
 153   save_rax(tmp);
 154   fwait(); fnstsw_ax();
 155   sahf();
 156   restore_rax(tmp);
 157   // branch
 158   jcc(Assembler::parity, L);
 159 }
 160 
 161 void MacroAssembler::jnC2(Register tmp, Label& L) {
 162   // set parity bit if FPU flag C2 is set (via rax)
 163   save_rax(tmp);
 164   fwait(); fnstsw_ax();
 165   sahf();
 166   restore_rax(tmp);
 167   // branch
 168   jcc(Assembler::noParity, L);
 169 }
 170 
 171 // 32bit can do a case table jump in one instruction but we no longer allow the base
 172 // to be installed in the Address class
 173 void MacroAssembler::jump(ArrayAddress entry) {
 174   jmp(as_Address(entry));
 175 }
 176 
 177 // Note: y_lo will be destroyed
 178 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 179   // Long compare for Java (semantics as described in JVM spec.)
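       // The result follows the lcmp bytecode convention: x_hi ends up -1, 0, or +1 for
       // x < y, x == y, x > y respectively. E.g. x = 2^32, y = 2^32 - 1: the high words
       // already differ, so the 'high' path is taken and the result is +1.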
 180   Label high, low, done;
 181 
 182   cmpl(x_hi, y_hi);
 183   jcc(Assembler::less, low);
 184   jcc(Assembler::greater, high);
 185   // x_hi is the return register
 186   xorl(x_hi, x_hi);
 187   cmpl(x_lo, y_lo);
 188   jcc(Assembler::below, low);
 189   jcc(Assembler::equal, done);
 190 
 191   bind(high);
 192   xorl(x_hi, x_hi);
 193   increment(x_hi);
 194   jmp(done);
 195 
 196   bind(low);
 197   xorl(x_hi, x_hi);
 198   decrementl(x_hi);
 199 
 200   bind(done);
 201 }
 202 
 203 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 204     mov_literal32(dst, (int32_t)src.target(), src.rspec());
 205 }
 206 
 207 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 208   // leal(dst, as_Address(adr));
 209   // see note in movl as to why we must use a move
 210   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
 211 }
 212 
 213 void MacroAssembler::leave() {
 214   mov(rsp, rbp);
 215   pop(rbp);
 216 }
 217 
 218 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
 219   // Multiplication of two Java long values stored on the stack
 220   // as illustrated below. Result is in rdx:rax.
 221   //
 222   // rsp ---> [  ??  ] \               \
 223   //            ....    | y_rsp_offset  |
 224   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
 225   //          [ y_hi ]                  | (in bytes)
 226   //            ....                    |
 227   //          [ x_lo ]                 /
 228   //          [ x_hi ]
 229   //            ....
 230   //
 231   // Basic idea: lo(result) = lo(x_lo * y_lo)
 232   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
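       // This follows from x * y = (x_hi*2^32 + x_lo) * (y_hi*2^32 + y_lo): the x_hi*y_hi
       // term only affects bits >= 64 and is dropped, and the two cross terms contribute
       // only their low 32 bits, which land in the high word of the 64-bit result.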
 233   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
 234   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
 235   Label quick;
 236   // load x_hi, y_hi and check if quick
 237   // multiplication is possible
 238   movl(rbx, x_hi);
 239   movl(rcx, y_hi);
 240   movl(rax, rbx);
 241   orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
 242   jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
 243   // do full multiplication
 244   // 1st step
 245   mull(y_lo);                                    // x_hi * y_lo
 246   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
 247   // 2nd step
 248   movl(rax, x_lo);
 249   mull(rcx);                                     // x_lo * y_hi
 250   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
 251   // 3rd step
 252   bind(quick);                                   // note: rbx = 0 if quick multiply!
 253   movl(rax, x_lo);
 254   mull(y_lo);                                    // x_lo * y_lo
 255   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
 256 }
 257 
 258 void MacroAssembler::lneg(Register hi, Register lo) {
 259   negl(lo);
 260   adcl(hi, 0);
 261   negl(hi);
 262 }
 263 
 264 void MacroAssembler::lshl(Register hi, Register lo) {
 265   // Java shift left long support (semantics as described in JVM spec., p.305)
 266   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
 267   // shift value is in rcx !
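       // Example: for s = 40 the code first does hi = lo, lo = 0 (a shift by 32), then the
       // shld/shl pair shifts by s mod 32 = 8, completing the full 40-bit shift.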
 268   assert(hi != rcx, "must not use rcx");
 269   assert(lo != rcx, "must not use rcx");
 270   const Register s = rcx;                        // shift count
 271   const int      n = BitsPerWord;
 272   Label L;
 273   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 274   cmpl(s, n);                                    // if (s < n)
 275   jcc(Assembler::less, L);                       // else (s >= n)
 276   movl(hi, lo);                                  // x := x << n
 277   xorl(lo, lo);
 278   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 279   bind(L);                                       // s (mod n) < n
 280   shldl(hi, lo);                                 // x := x << s
 281   shll(lo);
 282 }
 283 
 284 
 285 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
 286   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
 287   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
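       // sign_extension selects between Java's >> and >>>: the arithmetic variant refills
       // the high word from the sign bit (sar), the logical variant refills it with zeros (shr).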
 288   assert(hi != rcx, "must not use rcx");
 289   assert(lo != rcx, "must not use rcx");
 290   const Register s = rcx;                        // shift count
 291   const int      n = BitsPerWord;
 292   Label L;
 293   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
 294   cmpl(s, n);                                    // if (s < n)
 295   jcc(Assembler::less, L);                       // else (s >= n)
 296   movl(lo, hi);                                  // x := x >> n
 297   if (sign_extension) sarl(hi, 31);
 298   else                xorl(hi, hi);
 299   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
 300   bind(L);                                       // s (mod n) < n
 301   shrdl(lo, hi);                                 // x := x >> s
 302   if (sign_extension) sarl(hi);
 303   else                shrl(hi);
 304 }
 305 
 306 void MacroAssembler::movoop(Register dst, jobject obj) {
 307   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 308 }
 309 
 310 void MacroAssembler::movoop(Address dst, jobject obj) {
 311   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
 312 }
 313 
 314 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 315   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 316 }
 317 
 318 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 319   mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
 320 }
 321 
 322 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 323   // scratch register is not used,
 324   // it is defined to match parameters of 64-bit version of this method.
 325   if (src.is_lval()) {
 326     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
 327   } else {
 328     movl(dst, as_Address(src));
 329   }
 330 }
 331 
 332 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 333   movl(as_Address(dst), src);
 334 }
 335 
 336 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 337   movl(dst, as_Address(src));
 338 }
 339 
 340 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 341 void MacroAssembler::movptr(Address dst, intptr_t src) {
 342   movl(dst, src);
 343 }
 344 
 345 
 346 void MacroAssembler::pop_callee_saved_registers() {
 347   pop(rcx);
 348   pop(rdx);
 349   pop(rdi);
 350   pop(rsi);
 351 }
 352 
 353 void MacroAssembler::pop_fTOS() {
 354   fld_d(Address(rsp, 0));
 355   addl(rsp, 2 * wordSize);
 356 }
 357 
 358 void MacroAssembler::push_callee_saved_registers() {
 359   push(rsi);
 360   push(rdi);
 361   push(rdx);
 362   push(rcx);
 363 }
 364 
 365 void MacroAssembler::push_fTOS() {
 366   subl(rsp, 2 * wordSize);
 367   fstp_d(Address(rsp, 0));
 368 }
 369 
 370 
 371 void MacroAssembler::pushoop(jobject obj) {
 372   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
 373 }
 374 
 375 void MacroAssembler::pushklass(Metadata* obj) {
 376   push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
 377 }
 378 
 379 void MacroAssembler::pushptr(AddressLiteral src) {
 380   if (src.is_lval()) {
 381     push_literal32((int32_t)src.target(), src.rspec());
 382   } else {
 383     pushl(as_Address(src));
 384   }
 385 }
 386 
 387 void MacroAssembler::set_word_if_not_zero(Register dst) {
 388   xorl(dst, dst);
 389   set_byte_if_not_zero(dst);
 390 }
 391 
 392 static void pass_arg0(MacroAssembler* masm, Register arg) {
 393   masm->push(arg);
 394 }
 395 
 396 static void pass_arg1(MacroAssembler* masm, Register arg) {
 397   masm->push(arg);
 398 }
 399 
 400 static void pass_arg2(MacroAssembler* masm, Register arg) {
 401   masm->push(arg);
 402 }
 403 
 404 static void pass_arg3(MacroAssembler* masm, Register arg) {
 405   masm->push(arg);
 406 }
 407 
 408 #ifndef PRODUCT
 409 extern "C" void findpc(intptr_t x);
 410 #endif
 411 
 412 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
 413   // In order to get locks to work, we need to fake an in_VM state
 414   JavaThread* thread = JavaThread::current();
 415   JavaThreadState saved_state = thread->thread_state();
 416   thread->set_thread_state(_thread_in_vm);
 417   if (ShowMessageBoxOnError) {
 418     JavaThread* thread = JavaThread::current();
 419     JavaThreadState saved_state = thread->thread_state();
 420     thread->set_thread_state(_thread_in_vm);
 421     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 422       ttyLocker ttyl;
 423       BytecodeCounter::print();
 424     }
 425     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 426     // This is the value of eip which points to where verify_oop will return.
 427     if (os::message_box(msg, "Execution stopped, print registers?")) {
 428       print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
 429       BREAKPOINT;
 430     }
 431   } else {
 432     ttyLocker ttyl;
 433     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
 434   }
 435   // Don't assert holding the ttyLock
 436   assert(false, "DEBUG MESSAGE: %s", msg);
 437   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 438 }
 439 
 440 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
 441   ttyLocker ttyl;
 442   FlagSetting fs(Debugging, true);
 443   tty->print_cr("eip = 0x%08x", eip);
 444 #ifndef PRODUCT
 445   if ((WizardMode || Verbose) && PrintMiscellaneous) {
 446     tty->cr();
 447     findpc(eip);
 448     tty->cr();
 449   }
 450 #endif
 451 #define PRINT_REG(rax) \
 452   { tty->print("%s = ", #rax); os::print_location(tty, rax); }
 453   PRINT_REG(rax);
 454   PRINT_REG(rbx);
 455   PRINT_REG(rcx);
 456   PRINT_REG(rdx);
 457   PRINT_REG(rdi);
 458   PRINT_REG(rsi);
 459   PRINT_REG(rbp);
 460   PRINT_REG(rsp);
 461 #undef PRINT_REG
 462   // Print some words near top of stack.
 463   int* dump_sp = (int*) rsp;
 464   for (int col1 = 0; col1 < 8; col1++) {
 465     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 466     os::print_location(tty, *dump_sp++);
 467   }
 468   for (int row = 0; row < 16; row++) {
 469     tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 470     for (int col = 0; col < 8; col++) {
 471       tty->print(" 0x%08x", *dump_sp++);
 472     }
 473     tty->cr();
 474   }
 475   // Print some instructions around pc:
 476   Disassembler::decode((address)eip-64, (address)eip);
 477   tty->print_cr("--------");
 478   Disassembler::decode((address)eip, (address)eip+32);
 479 }
 480 
 481 void MacroAssembler::stop(const char* msg) {
 482   ExternalAddress message((address)msg);
 483   // push address of message
 484   pushptr(message.addr());
 485   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 486   pusha();                                            // push registers
 487   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
 488   hlt();
 489 }
 490 
 491 void MacroAssembler::warn(const char* msg) {
 492   push_CPU_state();
 493 
 494   ExternalAddress message((address) msg);
 495   // push address of message
 496   pushptr(message.addr());
 497 
 498   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
 499   addl(rsp, wordSize);       // discard argument
 500   pop_CPU_state();
 501 }
 502 
 503 void MacroAssembler::print_state() {
 504   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
 505   pusha();                                            // push registers
 506 
 507   push_CPU_state();
 508   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
 509   pop_CPU_state();
 510 
 511   popa();
 512   addl(rsp, wordSize);
 513 }
 514 
 515 #else // _LP64
 516 
 517 // 64 bit versions
 518 
 519 Address MacroAssembler::as_Address(AddressLiteral adr) {
 520   // amd64 always does this as a pc-rel
 521   // we can be absolute or disp based on the instruction type
 522   // jmp/call are displacements, others are absolute
 523   assert(!adr.is_lval(), "must be rval");
 524   assert(reachable(adr), "must be");
 525   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
 526 
 527 }
 528 
 529 Address MacroAssembler::as_Address(ArrayAddress adr) {
 530   AddressLiteral base = adr.base();
 531   lea(rscratch1, base);
 532   Address index = adr.index();
 533   assert(index._disp == 0, "must not have disp"); // maybe it can?
 534   Address array(rscratch1, index._index, index._scale, index._disp);
 535   return array;
 536 }
 537 
 538 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
 539   Label L, E;
 540 
 541 #ifdef _WIN64
 542   // Windows always allocates space for its register args
 543   assert(num_args <= 4, "only register arguments supported");
 544   subq(rsp,  frame::arg_reg_save_area_bytes);
 545 #endif
 546 
 547   // Align stack if necessary
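       // The native x86-64 calling conventions expect rsp to be 16-byte aligned at the call
       // site; if it is not (in practice it is then off by exactly 8), an extra 8-byte slot
       // is reserved before the call and released afterwards.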
 548   testl(rsp, 15);
 549   jcc(Assembler::zero, L);
 550 
 551   subq(rsp, 8);
 552   {
 553     call(RuntimeAddress(entry_point));
 554   }
 555   addq(rsp, 8);
 556   jmp(E);
 557 
 558   bind(L);
 559   {
 560     call(RuntimeAddress(entry_point));
 561   }
 562 
 563   bind(E);
 564 
 565 #ifdef _WIN64
 566   // restore stack pointer
 567   addq(rsp, frame::arg_reg_save_area_bytes);
 568 #endif
 569 
 570 }
 571 
 572 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
 573   assert(!src2.is_lval(), "should use cmpptr");
 574 
 575   if (reachable(src2)) {
 576     cmpq(src1, as_Address(src2));
 577   } else {
 578     lea(rscratch1, src2);
 579     Assembler::cmpq(src1, Address(rscratch1, 0));
 580   }
 581 }
 582 
 583 int MacroAssembler::corrected_idivq(Register reg) {
 584   // Full implementation of Java ldiv and lrem; checks for special
 585   // case as described in JVM spec., p.243 & p.271.  The function
 586   // returns the (pc) offset of the idivq instruction - may be needed
 587   // for implicit exceptions.
 588   //
 589   //         normal case                           special case
 590   //
 591   // input : rax: dividend                         min_long
 592   //         reg: divisor   (may not be eax/edx)   -1
 593   //
 594   // output: rax: quotient  (= rax idiv reg)       min_long
 595   //         rdx: remainder (= rax irem reg)       0
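       // The special case exists because min_long / -1 overflows: the true quotient 2^63 is
       // not representable and idivq would raise a divide error, while Java defines the
       // result as min_long with remainder 0 (e.g. Long.MIN_VALUE / -1 == Long.MIN_VALUE).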
 596   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
 597   static const int64_t min_long = 0x8000000000000000;
 598   Label normal_case, special_case;
 599 
 600   // check for special case
 601   cmp64(rax, ExternalAddress((address) &min_long));
 602   jcc(Assembler::notEqual, normal_case);
 603   xorl(rdx, rdx); // prepare rdx for possible special case (where
 604                   // remainder = 0)
 605   cmpq(reg, -1);
 606   jcc(Assembler::equal, special_case);
 607 
 608   // handle normal case
 609   bind(normal_case);
 610   cdqq();
 611   int idivq_offset = offset();
 612   idivq(reg);
 613 
 614   // normal and special case exit
 615   bind(special_case);
 616 
 617   return idivq_offset;
 618 }
 619 
 620 void MacroAssembler::decrementq(Register reg, int value) {
 621   if (value == min_jint) { subq(reg, value); return; }
 622   if (value <  0) { incrementq(reg, -value); return; }
 623   if (value == 0) {                        ; return; }
 624   if (value == 1 && UseIncDec) { decq(reg) ; return; }
 625   /* else */      { subq(reg, value)       ; return; }
 626 }
 627 
 628 void MacroAssembler::decrementq(Address dst, int value) {
 629   if (value == min_jint) { subq(dst, value); return; }
 630   if (value <  0) { incrementq(dst, -value); return; }
 631   if (value == 0) {                        ; return; }
 632   if (value == 1 && UseIncDec) { decq(dst) ; return; }
 633   /* else */      { subq(dst, value)       ; return; }
 634 }
 635 
 636 void MacroAssembler::incrementq(AddressLiteral dst) {
 637   if (reachable(dst)) {
 638     incrementq(as_Address(dst));
 639   } else {
 640     lea(rscratch1, dst);
 641     incrementq(Address(rscratch1, 0));
 642   }
 643 }
 644 
 645 void MacroAssembler::incrementq(Register reg, int value) {
 646   if (value == min_jint) { addq(reg, value); return; }
 647   if (value <  0) { decrementq(reg, -value); return; }
 648   if (value == 0) {                        ; return; }
 649   if (value == 1 && UseIncDec) { incq(reg) ; return; }
 650   /* else */      { addq(reg, value)       ; return; }
 651 }
 652 
 653 void MacroAssembler::incrementq(Address dst, int value) {
 654   if (value == min_jint) { addq(dst, value); return; }
 655   if (value <  0) { decrementq(dst, -value); return; }
 656   if (value == 0) {                        ; return; }
 657   if (value == 1 && UseIncDec) { incq(dst) ; return; }
 658   /* else */      { addq(dst, value)       ; return; }
 659 }
 660 
 661 // 32bit can do a case table jump in one instruction but we no longer allow the base
 662 // to be installed in the Address class
 663 void MacroAssembler::jump(ArrayAddress entry) {
 664   lea(rscratch1, entry.base());
 665   Address dispatch = entry.index();
 666   assert(dispatch._base == noreg, "must be");
 667   dispatch._base = rscratch1;
 668   jmp(dispatch);
 669 }
 670 
 671 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
 672   ShouldNotReachHere(); // 64bit doesn't use two regs
 673   cmpq(x_lo, y_lo);
 674 }
 675 
 676 void MacroAssembler::lea(Register dst, AddressLiteral src) {
 677     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 678 }
 679 
 680 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
 681   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
 682   movptr(dst, rscratch1);
 683 }
 684 
 685 void MacroAssembler::leave() {
 686   // %%% is this really better? Why not on 32bit too?
 687   emit_int8((unsigned char)0xC9); // LEAVE
 688 }
 689 
 690 void MacroAssembler::lneg(Register hi, Register lo) {
 691   ShouldNotReachHere(); // 64bit doesn't use two regs
 692   negq(lo);
 693 }
 694 
 695 void MacroAssembler::movoop(Register dst, jobject obj) {
 696   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 697 }
 698 
 699 void MacroAssembler::movoop(Address dst, jobject obj) {
 700   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
 701   movq(dst, rscratch1);
 702 }
 703 
 704 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
 705   mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 706 }
 707 
 708 void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
 709   mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
 710   movq(dst, rscratch1);
 711 }
 712 
 713 void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
 714   if (src.is_lval()) {
 715     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
 716   } else {
 717     if (reachable(src)) {
 718       movq(dst, as_Address(src));
 719     } else {
 720       lea(scratch, src);
 721       movq(dst, Address(scratch, 0));
 722     }
 723   }
 724 }
 725 
 726 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
 727   movq(as_Address(dst), src);
 728 }
 729 
 730 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
 731   movq(dst, as_Address(src));
 732 }
 733 
 734 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
 735 void MacroAssembler::movptr(Address dst, intptr_t src) {
 736   mov64(rscratch1, src);
 737   movq(dst, rscratch1);
 738 }
 739 
 740 // These are mostly for initializing NULL
 741 void MacroAssembler::movptr(Address dst, int32_t src) {
 742   movslq(dst, src);
 743 }
 744 
 745 void MacroAssembler::movptr(Register dst, int32_t src) {
 746   mov64(dst, (intptr_t)src);
 747 }
 748 
 749 void MacroAssembler::pushoop(jobject obj) {
 750   movoop(rscratch1, obj);
 751   push(rscratch1);
 752 }
 753 
 754 void MacroAssembler::pushklass(Metadata* obj) {
 755   mov_metadata(rscratch1, obj);
 756   push(rscratch1);
 757 }
 758 
 759 void MacroAssembler::pushptr(AddressLiteral src) {
 760   lea(rscratch1, src);
 761   if (src.is_lval()) {
 762     push(rscratch1);
 763   } else {
 764     pushq(Address(rscratch1, 0));
 765   }
 766 }
 767 
 768 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 769   // we must set sp to zero to clear frame
 770   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
 771   // must clear fp, so that compiled frames are not confused; it is
 772   // possible that we need it only for debugging
 773   if (clear_fp) {
 774     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
 775   }
 776 
 777   // Always clear the pc because it could have been set by make_walkable()
 778   movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
 779   vzeroupper();
 780 }
 781 
 782 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 783                                          Register last_java_fp,
 784                                          address  last_java_pc) {
 785   vzeroupper();
 786   // determine last_java_sp register
 787   if (!last_java_sp->is_valid()) {
 788     last_java_sp = rsp;
 789   }
 790 
 791   // last_java_fp is optional
 792   if (last_java_fp->is_valid()) {
 793     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
 794            last_java_fp);
 795   }
 796 
 797   // last_java_pc is optional
 798   if (last_java_pc != NULL) {
 799     Address java_pc(r15_thread,
 800                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
 801     lea(rscratch1, InternalAddress(last_java_pc));
 802     movptr(java_pc, rscratch1);
 803   }
 804 
 805   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
 806 }
 807 
 808 static void pass_arg0(MacroAssembler* masm, Register arg) {
 809   if (c_rarg0 != arg ) {
 810     masm->mov(c_rarg0, arg);
 811   }
 812 }
 813 
 814 static void pass_arg1(MacroAssembler* masm, Register arg) {
 815   if (c_rarg1 != arg ) {
 816     masm->mov(c_rarg1, arg);
 817   }
 818 }
 819 
 820 static void pass_arg2(MacroAssembler* masm, Register arg) {
 821   if (c_rarg2 != arg ) {
 822     masm->mov(c_rarg2, arg);
 823   }
 824 }
 825 
 826 static void pass_arg3(MacroAssembler* masm, Register arg) {
 827   if (c_rarg3 != arg ) {
 828     masm->mov(c_rarg3, arg);
 829   }
 830 }
 831 
 832 void MacroAssembler::stop(const char* msg) {
 833   address rip = pc();
 834   pusha(); // get regs on stack
 835   lea(c_rarg0, ExternalAddress((address) msg));
 836   lea(c_rarg1, InternalAddress(rip));
 837   movq(c_rarg2, rsp); // pass pointer to regs array
 838   andq(rsp, -16); // align stack as required by ABI
 839   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
 840   hlt();
 841 }
 842 
 843 void MacroAssembler::warn(const char* msg) {
 844   push(rbp);
 845   movq(rbp, rsp);
 846   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 847   push_CPU_state();   // keeps alignment at 16 bytes
 848   lea(c_rarg0, ExternalAddress((address) msg));
 849   lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
 850   call(rax);
 851   pop_CPU_state();
 852   mov(rsp, rbp);
 853   pop(rbp);
 854 }
 855 
 856 void MacroAssembler::print_state() {
 857   address rip = pc();
 858   pusha();            // get regs on stack
 859   push(rbp);
 860   movq(rbp, rsp);
 861   andq(rsp, -16);     // align stack as required by push_CPU_state and call
 862   push_CPU_state();   // keeps alignment at 16 bytes
 863 
 864   lea(c_rarg0, InternalAddress(rip));
 865   lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
 866   call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);
 867 
 868   pop_CPU_state();
 869   mov(rsp, rbp);
 870   pop(rbp);
 871   popa();
 872 }
 873 
 874 #ifndef PRODUCT
 875 extern "C" void findpc(intptr_t x);
 876 #endif
 877 
 878 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
 879   // In order to get locks to work, we need to fake an in_VM state
 880   if (ShowMessageBoxOnError) {
 881     JavaThread* thread = JavaThread::current();
 882     JavaThreadState saved_state = thread->thread_state();
 883     thread->set_thread_state(_thread_in_vm);
 884 #ifndef PRODUCT
 885     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
 886       ttyLocker ttyl;
 887       BytecodeCounter::print();
 888     }
 889 #endif
 890     // To see where a verify_oop failed, get $ebx+40/X for this frame.
 891     // XXX correct this offset for amd64
 892     // This is the value of eip which points to where verify_oop will return.
 893     if (os::message_box(msg, "Execution stopped, print registers?")) {
 894       print_state64(pc, regs);
 895       BREAKPOINT;
 896       assert(false, "start up GDB");
 897     }
 898     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
 899   } else {
 900     ttyLocker ttyl;
 901     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
 902                     msg);
 903     assert(false, "DEBUG MESSAGE: %s", msg);
 904   }
 905 }
 906 
 907 void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
 908   ttyLocker ttyl;
 909   FlagSetting fs(Debugging, true);
 910   tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
 911 #ifndef PRODUCT
 912   tty->cr();
 913   findpc(pc);
 914   tty->cr();
 915 #endif
 916 #define PRINT_REG(rax, value) \
 917   { tty->print("%s = ", #rax); os::print_location(tty, value); }
 918   PRINT_REG(rax, regs[15]);
 919   PRINT_REG(rbx, regs[12]);
 920   PRINT_REG(rcx, regs[14]);
 921   PRINT_REG(rdx, regs[13]);
 922   PRINT_REG(rdi, regs[8]);
 923   PRINT_REG(rsi, regs[9]);
 924   PRINT_REG(rbp, regs[10]);
 925   PRINT_REG(rsp, regs[11]);
 926   PRINT_REG(r8 , regs[7]);
 927   PRINT_REG(r9 , regs[6]);
 928   PRINT_REG(r10, regs[5]);
 929   PRINT_REG(r11, regs[4]);
 930   PRINT_REG(r12, regs[3]);
 931   PRINT_REG(r13, regs[2]);
 932   PRINT_REG(r14, regs[1]);
 933   PRINT_REG(r15, regs[0]);
 934 #undef PRINT_REG
 935   // Print some words near top of stack.
 936   int64_t* rsp = (int64_t*) regs[11];
 937   int64_t* dump_sp = rsp;
 938   for (int col1 = 0; col1 < 8; col1++) {
 939     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 940     os::print_location(tty, *dump_sp++);
 941   }
 942   for (int row = 0; row < 25; row++) {
 943     tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
 944     for (int col = 0; col < 4; col++) {
 945       tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
 946     }
 947     tty->cr();
 948   }
 949   // Print some instructions around pc:
 950   Disassembler::decode((address)pc-64, (address)pc);
 951   tty->print_cr("--------");
 952   Disassembler::decode((address)pc, (address)pc+32);
 953 }
 954 
 955 #endif // _LP64
 956 
 957 // Now versions that are common to 32/64 bit
 958 
 959 void MacroAssembler::addptr(Register dst, int32_t imm32) {
 960   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
 961 }
 962 
 963 void MacroAssembler::addptr(Register dst, Register src) {
 964   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 965 }
 966 
 967 void MacroAssembler::addptr(Address dst, Register src) {
 968   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
 969 }
 970 
 971 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
 972   if (reachable(src)) {
 973     Assembler::addsd(dst, as_Address(src));
 974   } else {
 975     lea(rscratch1, src);
 976     Assembler::addsd(dst, Address(rscratch1, 0));
 977   }
 978 }
 979 
 980 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
 981   if (reachable(src)) {
 982     addss(dst, as_Address(src));
 983   } else {
 984     lea(rscratch1, src);
 985     addss(dst, Address(rscratch1, 0));
 986   }
 987 }
 988 
 989 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
 990   if (reachable(src)) {
 991     Assembler::addpd(dst, as_Address(src));
 992   } else {
 993     lea(rscratch1, src);
 994     Assembler::addpd(dst, Address(rscratch1, 0));
 995   }
 996 }
 997 
 998 void MacroAssembler::align(int modulus) {
 999   align(modulus, offset());
1000 }
1001 
1002 void MacroAssembler::align(int modulus, int target) {
1003   if (target % modulus != 0) {
1004     nop(modulus - (target % modulus));
1005   }
1006 }
1007 
1008 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1009   // Used in sign-masking with aligned address.
1010   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1011   if (reachable(src)) {
1012     Assembler::andpd(dst, as_Address(src));
1013   } else {
1014     lea(scratch_reg, src);
1015     Assembler::andpd(dst, Address(scratch_reg, 0));
1016   }
1017 }
1018 
1019 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
1020   // Used in sign-masking with aligned address.
1021   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1022   if (reachable(src)) {
1023     Assembler::andps(dst, as_Address(src));
1024   } else {
1025     lea(scratch_reg, src);
1026     Assembler::andps(dst, Address(scratch_reg, 0));
1027   }
1028 }
1029 
1030 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1031   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1032 }
1033 
1034 void MacroAssembler::atomic_incl(Address counter_addr) {
1035   lock();
1036   incrementl(counter_addr);
1037 }
1038 
1039 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1040   if (reachable(counter_addr)) {
1041     atomic_incl(as_Address(counter_addr));
1042   } else {
1043     lea(scr, counter_addr);
1044     atomic_incl(Address(scr, 0));
1045   }
1046 }
1047 
1048 #ifdef _LP64
1049 void MacroAssembler::atomic_incq(Address counter_addr) {
1050   lock();
1051   incrementq(counter_addr);
1052 }
1053 
1054 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1055   if (reachable(counter_addr)) {
1056     atomic_incq(as_Address(counter_addr));
1057   } else {
1058     lea(scr, counter_addr);
1059     atomic_incq(Address(scr, 0));
1060   }
1061 }
1062 #endif
1063 
1064 // Writes to successive stack pages until the given size is reached, to check for
1065 // stack overflow + shadow pages.  This clobbers tmp.
1066 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1067   movptr(tmp, rsp);
1068   // Bang stack for total size given plus shadow page size.
1069   // Bang one page at a time because large size can bang beyond yellow and
1070   // red zones.
1071   Label loop;
1072   bind(loop);
1073   movl(Address(tmp, (-os::vm_page_size())), size );
1074   subptr(tmp, os::vm_page_size());
1075   subl(size, os::vm_page_size());
1076   jcc(Assembler::greater, loop);
1077 
1078   // Bang down shadow pages too.
1079   // At this point, (tmp-0) is the last address touched, so don't
1080   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
1081   // was post-decremented.)  Skip this address by starting at i=1, and
1082   // touch a few more pages below.  N.B.  It is important to touch all
1083   // the way down including all pages in the shadow zone.
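       // For example, with 4K pages and an 8-page shadow zone this loop touches
       // tmp-4K, tmp-8K, ..., tmp-28K (i = 1..7); i = 0 is skipped as explained above.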
1084   for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
1085     // this could be any sized move but this can be a debugging crumb
1086     // so the bigger the better.
1087     movptr(Address(tmp, (-i*os::vm_page_size())), size );
1088   }
1089 }
1090 
1091 void MacroAssembler::reserved_stack_check() {
1092     // testing if reserved zone needs to be enabled
1093     Label no_reserved_zone_enabling;
1094     Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
1095     NOT_LP64(get_thread(rsi);)
1096 
1097     cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
1098     jcc(Assembler::below, no_reserved_zone_enabling);
1099 
1100     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
1101     jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
1102     should_not_reach_here();
1103 
1104     bind(no_reserved_zone_enabling);
1105 }
1106 
1107 int MacroAssembler::biased_locking_enter(Register lock_reg,
1108                                          Register obj_reg,
1109                                          Register swap_reg,
1110                                          Register tmp_reg,
1111                                          bool swap_reg_contains_mark,
1112                                          Label& done,
1113                                          Label* slow_case,
1114                                          BiasedLockingCounters* counters) {
1115   assert(UseBiasedLocking, "why call this otherwise?");
1116   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1117   assert(tmp_reg != noreg, "tmp_reg must be supplied");
1118   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1119   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1120   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
1121   NOT_LP64( Address saved_mark_addr(lock_reg, 0); )
1122 
1123   if (PrintBiasedLockingStatistics && counters == NULL) {
1124     counters = BiasedLocking::counters();
1125   }
1126   // Biased locking
1127   // See whether the lock is currently biased toward our thread and
1128   // whether the epoch is still valid
1129   // Note that the runtime guarantees sufficient alignment of JavaThread
1130   // pointers to allow age to be placed into low bits
1131   // First check to see whether biasing is even enabled for this object
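       // Sketch of the mark word this code relies on (roughly): [thread | epoch | age | lock bits],
       // where the low bits equal markOopDesc::biased_lock_pattern (binary 101) iff the object
       // is biased or biasable; anything else falls through to the CAS-based scheme below.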
1132   Label cas_label;
1133   int null_check_offset = -1;
1134   if (!swap_reg_contains_mark) {
1135     null_check_offset = offset();
1136     movptr(swap_reg, mark_addr);
1137   }
1138   movptr(tmp_reg, swap_reg);
1139   andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1140   cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1141   jcc(Assembler::notEqual, cas_label);
1142   // The bias pattern is present in the object's header. Need to check
1143   // whether the bias owner and the epoch are both still current.
1144 #ifndef _LP64
1145   // Note that because there is no current thread register on x86_32 we
1146   // need to store off the mark word we read out of the object to
1147   // avoid reloading it and needing to recheck invariants below. This
1148   // store is unfortunate but it makes the overall code shorter and
1149   // simpler.
1150   movptr(saved_mark_addr, swap_reg);
1151 #endif
1152   if (swap_reg_contains_mark) {
1153     null_check_offset = offset();
1154   }
1155   load_prototype_header(tmp_reg, obj_reg);
1156 #ifdef _LP64
1157   orptr(tmp_reg, r15_thread);
1158   xorptr(tmp_reg, swap_reg);
1159   Register header_reg = tmp_reg;
1160 #else
1161   xorptr(tmp_reg, swap_reg);
1162   get_thread(swap_reg);
1163   xorptr(swap_reg, tmp_reg);
1164   Register header_reg = swap_reg;
1165 #endif
1166   andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1167   if (counters != NULL) {
1168     cond_inc32(Assembler::zero,
1169                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1170   }
1171   jcc(Assembler::equal, done);
1172 
1173   Label try_revoke_bias;
1174   Label try_rebias;
1175 
1176   // At this point we know that the header has the bias pattern and
1177   // that we are not the bias owner in the current epoch. We need to
1178   // figure out more details about the state of the header in order to
1179   // know what operations can be legally performed on the object's
1180   // header.
1181 
1182   // If the low three bits in the xor result aren't clear, that means
1183   // the prototype header is no longer biased and we have to revoke
1184   // the bias on this object.
1185   testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1186   jccb(Assembler::notZero, try_revoke_bias);
1187 
1188   // Biasing is still enabled for this data type. See whether the
1189   // epoch of the current bias is still valid, meaning that the epoch
1190   // bits of the mark word are equal to the epoch bits of the
1191   // prototype header. (Note that the prototype header's epoch bits
1192   // only change at a safepoint.) If not, attempt to rebias the object
1193   // toward the current thread. Note that we must be absolutely sure
1194   // that the current epoch is invalid in order to do this because
1195   // otherwise the manipulations it performs on the mark word are
1196   // illegal.
1197   testptr(header_reg, markOopDesc::epoch_mask_in_place);
1198   jccb(Assembler::notZero, try_rebias);
1199 
1200   // The epoch of the current bias is still valid but we know nothing
1201   // about the owner; it might be set or it might be clear. Try to
1202   // acquire the bias of the object using an atomic operation. If this
1203   // fails we will go in to the runtime to revoke the object's bias.
1204   // Note that we first construct the presumed unbiased header so we
1205   // don't accidentally blow away another thread's valid bias.
1206   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1207   andptr(swap_reg,
1208          markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1209 #ifdef _LP64
1210   movptr(tmp_reg, swap_reg);
1211   orptr(tmp_reg, r15_thread);
1212 #else
1213   get_thread(tmp_reg);
1214   orptr(tmp_reg, swap_reg);
1215 #endif
1216   lock();
1217   cmpxchgptr(tmp_reg, mark_addr); // compare swap_reg (rax) with the mark word; install tmp_reg on match
1218   // If the biasing toward our thread failed, this means that
1219   // another thread succeeded in biasing it toward itself and we
1220   // need to revoke that bias. The revocation will occur in the
1221   // interpreter runtime in the slow case.
1222   if (counters != NULL) {
1223     cond_inc32(Assembler::zero,
1224                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1225   }
1226   if (slow_case != NULL) {
1227     jcc(Assembler::notZero, *slow_case);
1228   }
1229   jmp(done);
1230 
1231   bind(try_rebias);
1232   // At this point we know the epoch has expired, meaning that the
1233   // current "bias owner", if any, is actually invalid. Under these
1234   // circumstances _only_, we are allowed to use the current header's
1235   // value as the comparison value when doing the cas to acquire the
1236   // bias in the current epoch. In other words, we allow transfer of
1237   // the bias from one thread to another directly in this situation.
1238   //
1239   // FIXME: due to a lack of registers we currently blow away the age
1240   // bits in this situation. Should attempt to preserve them.
1241   load_prototype_header(tmp_reg, obj_reg);
1242 #ifdef _LP64
1243   orptr(tmp_reg, r15_thread);
1244 #else
1245   get_thread(swap_reg);
1246   orptr(tmp_reg, swap_reg);
1247   movptr(swap_reg, saved_mark_addr);
1248 #endif
1249   lock();
1250   cmpxchgptr(tmp_reg, mark_addr); // compare swap_reg (rax) with the mark word; install tmp_reg on match
1251   // If the biasing toward our thread failed, then another thread
1252   // succeeded in biasing it toward itself and we need to revoke that
1253   // bias. The revocation will occur in the runtime in the slow case.
1254   if (counters != NULL) {
1255     cond_inc32(Assembler::zero,
1256                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1257   }
1258   if (slow_case != NULL) {
1259     jcc(Assembler::notZero, *slow_case);
1260   }
1261   jmp(done);
1262 
1263   bind(try_revoke_bias);
1264   // The prototype mark in the klass doesn't have the bias bit set any
1265   // more, indicating that objects of this data type are not supposed
1266   // to be biased any more. We are going to try to reset the mark of
1267   // this object to the prototype value and fall through to the
1268   // CAS-based locking scheme. Note that if our CAS fails, it means
1269   // that another thread raced us for the privilege of revoking the
1270   // bias of this particular object, so it's okay to continue in the
1271   // normal locking code.
1272   //
1273   // FIXME: due to a lack of registers we currently blow away the age
1274   // bits in this situation. Should attempt to preserve them.
1275   NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1276   load_prototype_header(tmp_reg, obj_reg);
1277   lock();
1278   cmpxchgptr(tmp_reg, mark_addr); // compare swap_reg (rax) with the mark word; install tmp_reg on match
1279   // Fall through to the normal CAS-based lock, because no matter what
1280   // the result of the above CAS, some thread must have succeeded in
1281   // removing the bias bit from the object's header.
1282   if (counters != NULL) {
1283     cond_inc32(Assembler::zero,
1284                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1285   }
1286 
1287   bind(cas_label);
1288 
1289   return null_check_offset;
1290 }
1291 
1292 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1293   assert(UseBiasedLocking, "why call this otherwise?");
1294 
1295   // Check for biased locking unlock case, which is a no-op
1296   // Note: we do not have to check the thread ID for two reasons.
1297   // First, the interpreter checks for IllegalMonitorStateException at
1298   // a higher level. Second, if the bias was revoked while we held the
1299   // lock, the object could not be rebiased toward another thread, so
1300   // the bias bit would be clear.
1301   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1302   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1303   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1304   jcc(Assembler::equal, done);
1305 }
1306 
1307 #ifdef COMPILER2
1308 
1309 #if INCLUDE_RTM_OPT
1310 
1311 // Update rtm_counters based on abort status
1312 // input: abort_status
1313 //        rtm_counters (RTMLockingCounters*)
1314 // flags are killed
1315 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1316 
1317   atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1318   if (PrintPreciseRTMLockingStatistics) {
1319     for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1320       Label check_abort;
1321       testl(abort_status, (1<<i));
1322       jccb(Assembler::equal, check_abort);
1323       atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1324       bind(check_abort);
1325     }
1326   }
1327 }
1328 
1329 // Branch if (random & (count-1) != 0), count is 2^n
1330 // tmp, scr and flags are killed
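     // Example: with count = 64 the branch is taken unless the low 6 bits of the TSC are all
     // zero, i.e. roughly 63 out of every 64 calls, giving a cheap 1-in-count sampling filter.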
1331 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1332   assert(tmp == rax, "");
1333   assert(scr == rdx, "");
1334   rdtsc(); // modifies EDX:EAX
1335   andptr(tmp, count-1);
1336   jccb(Assembler::notZero, brLabel);
1337 }
1338 
1339 // Perform abort ratio calculation, set no_rtm bit if high ratio
1340 // input:  rtm_counters_Reg (RTMLockingCounters* address)
1341 // tmpReg, rtm_counters_Reg and flags are killed
1342 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1343                                                  Register rtm_counters_Reg,
1344                                                  RTMLockingCounters* rtm_counters,
1345                                                  Metadata* method_data) {
1346   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1347 
1348   if (RTMLockingCalculationDelay > 0) {
1349     // Delay calculation
1350     movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1351     testptr(tmpReg, tmpReg);
1352     jccb(Assembler::equal, L_done);
1353   }
1354   // Abort ratio calculation only if abort_count > RTMAbortThreshold
1355   //   Aborted transactions = abort_count * 100
1356   //   All transactions = total_count *  RTMTotalCountIncrRate
1357   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
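       // Worked example (illustrative values): with RTMAbortRatio = 50 the test below is
       // abort_count * 100 >= total_count * RTMTotalCountIncrRate * 50, i.e. RTM is disabled
       // once at least half of all transactions abort.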
1358 
1359   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1360   cmpptr(tmpReg, RTMAbortThreshold);
1361   jccb(Assembler::below, L_check_always_rtm2);
1362   imulptr(tmpReg, tmpReg, 100);
1363 
1364   Register scrReg = rtm_counters_Reg;
1365   movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1366   imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1367   imulptr(scrReg, scrReg, RTMAbortRatio);
1368   cmpptr(tmpReg, scrReg);
1369   jccb(Assembler::below, L_check_always_rtm1);
1370   if (method_data != NULL) {
1371     // set rtm_state to "no rtm" in MDO
1372     mov_metadata(tmpReg, method_data);
1373     lock();
1374     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1375   }
1376   jmpb(L_done);
1377   bind(L_check_always_rtm1);
1378   // Reload RTMLockingCounters* address
1379   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1380   bind(L_check_always_rtm2);
1381   movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1382   cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1383   jccb(Assembler::below, L_done);
1384   if (method_data != NULL) {
1385     // set rtm_state to "always rtm" in MDO
1386     mov_metadata(tmpReg, method_data);
1387     lock();
1388     orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1389   }
1390   bind(L_done);
1391 }
1392 
1393 // Update counters and perform abort ratio calculation
1394 // input:  abort_status_Reg
1395 // rtm_counters_Reg, flags are killed
1396 void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1397                                    Register rtm_counters_Reg,
1398                                    RTMLockingCounters* rtm_counters,
1399                                    Metadata* method_data,
1400                                    bool profile_rtm) {
1401 
1402   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1403   // update rtm counters based on rax value at abort
1404   // reads abort_status_Reg, updates flags
1405   lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1406   rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1407   if (profile_rtm) {
1408     // Save abort status because abort_status_Reg is used by following code.
1409     if (RTMRetryCount > 0) {
1410       push(abort_status_Reg);
1411     }
1412     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1413     rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1414     // restore abort status
1415     if (RTMRetryCount > 0) {
1416       pop(abort_status_Reg);
1417     }
1418   }
1419 }
1420 
1421 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1422 // inputs: retry_count_Reg
1423 //       : abort_status_Reg
1424 // output: retry_count_Reg decremented by 1
1425 // flags are killed
1426 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1427   Label doneRetry;
1428   assert(abort_status_Reg == rax, "");
1429   // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1430   // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1431   // if reason is in 0x6 and retry count != 0 then retry
1432   andptr(abort_status_Reg, 0x6);
1433   jccb(Assembler::zero, doneRetry);
1434   testl(retry_count_Reg, retry_count_Reg);
1435   jccb(Assembler::zero, doneRetry);
1436   pause();
1437   decrementl(retry_count_Reg);
1438   jmp(retryLabel);
1439   bind(doneRetry);
1440 }
1441 
1442 // Spin and retry if lock is busy,
1443 // inputs: box_Reg (monitor address)
1444 //       : retry_count_Reg
1445 // output: retry_count_Reg decremented by 1
1446 //       : clear z flag if retry count exceeded
1447 // tmp_Reg, scr_Reg, flags are killed
1448 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1449                                             Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1450   Label SpinLoop, SpinExit, doneRetry;
1451   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1452 
1453   testl(retry_count_Reg, retry_count_Reg);
1454   jccb(Assembler::zero, doneRetry);
1455   decrementl(retry_count_Reg);
1456   movptr(scr_Reg, RTMSpinLoopCount);
1457 
1458   bind(SpinLoop);
1459   pause();
1460   decrementl(scr_Reg);
1461   jccb(Assembler::lessEqual, SpinExit);
1462   movptr(tmp_Reg, Address(box_Reg, owner_offset));
1463   testptr(tmp_Reg, tmp_Reg);
1464   jccb(Assembler::notZero, SpinLoop);
1465 
1466   bind(SpinExit);
1467   jmp(retryLabel);
1468   bind(doneRetry);
1469   incrementl(retry_count_Reg); // clear z flag
1470 }
1471 
1472 // Use RTM for normal stack locks
1473 // Input: objReg (object to lock)
1474 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1475                                        Register retry_on_abort_count_Reg,
1476                                        RTMLockingCounters* stack_rtm_counters,
1477                                        Metadata* method_data, bool profile_rtm,
1478                                        Label& DONE_LABEL, Label& IsInflated) {
1479   assert(UseRTMForStackLocks, "why call this otherwise?");
1480   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1481   assert(tmpReg == rax, "");
1482   assert(scrReg == rdx, "");
1483   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1484 
1485   if (RTMRetryCount > 0) {
1486     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1487     bind(L_rtm_retry);
1488   }
1489   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1490   testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1491   jcc(Assembler::notZero, IsInflated);
1492 
1493   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1494     Label L_noincrement;
1495     if (RTMTotalCountIncrRate > 1) {
1496       // tmpReg, scrReg and flags are killed
1497       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1498     }
1499     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1500     atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1501     bind(L_noincrement);
1502   }
1503   xbegin(L_on_abort);
1504   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));       // fetch markword
1505   andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1506   cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1507   jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1508 
1509   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1510   if (UseRTMXendForLockBusy) {
1511     xend();
1512     movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1513     jmp(L_decrement_retry);
1514   }
1515   else {
1516     xabort(0);
1517   }
1518   bind(L_on_abort);
1519   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1520     rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1521   }
1522   bind(L_decrement_retry);
1523   if (RTMRetryCount > 0) {
1524     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1525     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1526   }
1527 }
1528 
1529 // Use RTM for inflated locks
1530 // inputs: objReg (object to lock)
1531 //         boxReg (on-stack box address (displaced header location) - KILLED)
1532 //         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1533 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1534                                           Register scrReg, Register retry_on_busy_count_Reg,
1535                                           Register retry_on_abort_count_Reg,
1536                                           RTMLockingCounters* rtm_counters,
1537                                           Metadata* method_data, bool profile_rtm,
1538                                           Label& DONE_LABEL) {
1539   assert(UseRTMLocking, "why call this otherwise?");
1540   assert(tmpReg == rax, "");
1541   assert(scrReg == rdx, "");
1542   Label L_rtm_retry, L_decrement_retry, L_on_abort;
1543   int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1544 
1545   // Without cast to int32_t a movptr will destroy r10 which is typically obj
1546   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1547   movptr(boxReg, tmpReg); // Save ObjectMonitor address
1548 
1549   if (RTMRetryCount > 0) {
1550     movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1551     movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1552     bind(L_rtm_retry);
1553   }
1554   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1555     Label L_noincrement;
1556     if (RTMTotalCountIncrRate > 1) {
1557       // tmpReg, scrReg and flags are killed
1558       branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
1559     }
1560     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1561     atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1562     bind(L_noincrement);
1563   }
1564   xbegin(L_on_abort);
1565   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
1566   movptr(tmpReg, Address(tmpReg, owner_offset));
1567   testptr(tmpReg, tmpReg);
1568   jcc(Assembler::zero, DONE_LABEL);
1569   if (UseRTMXendForLockBusy) {
1570     xend();
1571     jmp(L_decrement_retry);
1572   }
1573   else {
1574     xabort(0);
1575   }
1576   bind(L_on_abort);
1577   Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1578   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1579     rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1580   }
1581   if (RTMRetryCount > 0) {
1582     // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1583     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1584   }
1585 
1586   movptr(tmpReg, Address(boxReg, owner_offset)) ;
1587   testptr(tmpReg, tmpReg) ;
1588   jccb(Assembler::notZero, L_decrement_retry) ;
1589 
1590   // Appears unlocked - try to swing _owner from null to non-null.
1591   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1592 #ifdef _LP64
1593   Register threadReg = r15_thread;
1594 #else
1595   get_thread(scrReg);
1596   Register threadReg = scrReg;
1597 #endif
1598   lock();
1599   cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1600 
1601   if (RTMRetryCount > 0) {
1602     // success done else retry
1603     jccb(Assembler::equal, DONE_LABEL) ;
1604     bind(L_decrement_retry);
1605     // Spin and retry if lock is busy.
1606     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1607   }
1608   else {
1609     bind(L_decrement_retry);
1610   }
1611 }
1612 
1613 #endif //  INCLUDE_RTM_OPT
1614 
1615 // Fast_Lock and Fast_Unlock used by C2
1616 
1617 // Because the transitions from emitted code to the runtime
1618 // monitorenter/exit helper stubs are so slow it's critical that
1619 // we inline both the stack-locking fast-path and the inflated fast path.
1620 //
1621 // See also: cmpFastLock and cmpFastUnlock.
1622 //
1623 // What follows is a specialized inline transliteration of the code
1624 // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1625 // another option would be to emit TrySlowEnter and TrySlowExit methods
1626 // at startup-time.  These methods would accept arguments as
1627 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1628 // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1629 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1630 // In practice, however, the # of lock sites is bounded and is usually small.
1631 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1632 // if the processor uses simple bimodal branch predictors keyed by EIP,
1633 // since the helper routines would be called from multiple synchronization
1634 // sites.
1635 //
1636 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1637 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1638 // to those specialized methods.  That'd give us a mostly platform-independent
1639 // implementation that the JITs could optimize and inline at their pleasure.
1640 // Done correctly, the only time we'd need to cross to native code would be
1641 // to park() or unpark() threads.  We'd also need a few more unsafe operators
1642 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1643 // (b) explicit barriers or fence operations.
1644 //
1645 // TODO:
1646 //
1647 // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1648 //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1649 //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1650 //    the lock operators would typically be faster than reifying Self.
1651 //
1652 // *  Ideally I'd define the primitives as:
1653 //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1654 //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1655 //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1656 //    Instead, we're stuck with the rather awkward and brittle register assignments below.
1657 //    Furthermore the register assignments are overconstrained, possibly resulting in
1658 //    sub-optimal code near the synchronization site.
1659 //
1660 // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1661 //    Alternately, use a better sp-proximity test.
1662 //
1663 // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1664 //    Either one is sufficient to uniquely identify a thread.
1665 //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1666 //
1667 // *  Intrinsify notify() and notifyAll() for the common cases where the
1668 //    object is locked by the calling thread but the waitlist is empty.
1669 //    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1670 //
1671 // *  use jccb and jmpb instead of jcc and jmp to improve code density.
1672 //    But beware of excessive branch density on AMD Opterons.
1673 //
1674 // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1675 //    or failure of the fast-path.  If the fast-path fails then we pass
1676 //    control to the slow-path, typically in C.  In Fast_Lock and
1677 //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1678 //    will emit a conditional branch immediately after the node.
1679 //    So we have branches to branches and lots of ICC.ZF games.
1680 //    Instead, it might be better to have C2 pass a "FailureLabel"
1681 //    into Fast_Lock and Fast_Unlock.  In the case of success, control
1682 //    will drop through the node.  ICC.ZF is undefined at exit.
1683 //    In the case of failure, the node will branch directly to the
1684 //    FailureLabel
1685 
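     // A rough, illustrative sketch (not the emitted code) of the fast_lock
     // control flow below, assuming biased locking and RTM are disabled:
     //
     //   mark = obj->mark();
     //   if (mark & monitor_value) goto IsInflated;              // already inflated
     //   // attempt stack-locking: expect an unlocked mark, install the box
     //   box->dhw = mark | unlocked_value;
     //   if (CAS(&obj->mark, mark | unlocked_value, box)) { ZF = 1; goto DONE; }
     //   // CAS failed: recursive stack-lock? (sp-proximity test)
     //   box->dhw = (observed_mark - rsp) & page_mask;           // 0 iff locked by this frame
     //   goto DONE;                                              // ZF set by the masking above
     //   IsInflated:
     //   CAS Self (or, on 32-bit, a stack address) into m->owner; ZF reflects the CAS.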
1686 
1687 // obj: object to lock
1688 // box: on-stack box address (displaced header location) - KILLED
1689 // rax,: tmp -- KILLED
1690 // scr: tmp -- KILLED
1691 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1692                                Register scrReg, Register cx1Reg, Register cx2Reg,
1693                                BiasedLockingCounters* counters,
1694                                RTMLockingCounters* rtm_counters,
1695                                RTMLockingCounters* stack_rtm_counters,
1696                                Metadata* method_data,
1697                                bool use_rtm, bool profile_rtm) {
1698   // Ensure the register assignments are disjoint
1699   assert(tmpReg == rax, "");
1700 
1701   if (use_rtm) {
1702     assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1703   } else {
1704     assert(cx1Reg == noreg, "");
1705     assert(cx2Reg == noreg, "");
1706     assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1707   }
1708 
1709   if (counters != NULL) {
1710     atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1711   }
1712 
1713   // Possible cases that we'll encounter in fast_lock
1714   // ------------------------------------------------
1715   // * Inflated
1716   //    -- unlocked
1717   //    -- Locked
1718   //       = by self
1719   //       = by other
1720   // * biased
1721   //    -- by Self
1722   //    -- by other
1723   // * neutral
1724   // * stack-locked
1725   //    -- by self
1726   //       = sp-proximity test hits
1727   //       = sp-proximity test generates false-negative
1728   //    -- by other
1729   //
1730 
1731   Label IsInflated, DONE_LABEL;
1732 
1733   // it's stack-locked, biased or neutral
1734   // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1735   // order to reduce the number of conditional branches in the most common cases.
1736   // Beware -- there's a subtle invariant that fetch of the markword
1737   // at [FETCH], below, will never observe a biased encoding (*101b).
1738   // If this invariant is not held we risk exclusion (safety) failure.
1739   if (UseBiasedLocking && !UseOptoBiasInlining) {
1740     biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1741   }
1742 
1743 #if INCLUDE_RTM_OPT
1744   if (UseRTMForStackLocks && use_rtm) {
1745     rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1746                       stack_rtm_counters, method_data, profile_rtm,
1747                       DONE_LABEL, IsInflated);
1748   }
1749 #endif // INCLUDE_RTM_OPT
1750 
1751   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));          // [FETCH]
1752   testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1753   jccb(Assembler::notZero, IsInflated);
1754 
1755   // Attempt stack-locking ...
1756   orptr (tmpReg, markOopDesc::unlocked_value);
1757   movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1758   lock();
1759   cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));      // Updates tmpReg
1760   if (counters != NULL) {
1761     cond_inc32(Assembler::equal,
1762                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1763   }
1764   jcc(Assembler::equal, DONE_LABEL);           // Success
1765 
1766   // Recursive locking.
1767   // The object is stack-locked: markword contains stack pointer to BasicLock.
1768   // Locked by current thread if difference with current SP is less than one page.
1769   subptr(tmpReg, rsp);
1770   // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1771   andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1772   movptr(Address(boxReg, 0), tmpReg);
1773   if (counters != NULL) {
1774     cond_inc32(Assembler::equal,
1775                ExternalAddress((address)counters->fast_path_entry_count_addr()));
1776   }
1777   jmp(DONE_LABEL);
1778 
1779   bind(IsInflated);
1780   // The object is inflated. tmpReg contains the ObjectMonitor* + markOopDesc::monitor_value
1781 
1782 #if INCLUDE_RTM_OPT
1783   // Use the same RTM locking code in 32- and 64-bit VM.
1784   if (use_rtm) {
1785     rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1786                          rtm_counters, method_data, profile_rtm, DONE_LABEL);
1787   } else {
1788 #endif // INCLUDE_RTM_OPT
1789 
1790 #ifndef _LP64
1791   // The object is inflated.
1792 
1793   // boxReg refers to the on-stack BasicLock in the current frame.
1794   // We'd like to write:
1795   //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1796   // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
1797   // additional latency as we have another ST in the store buffer that must drain.
1798 
1799   // avoid ST-before-CAS
1800   // register juggle because we need tmpReg for cmpxchgptr below
1801   movptr(scrReg, boxReg);
1802   movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1803 
1804   // Optimistic form: consider XORL tmpReg,tmpReg
1805   movptr(tmpReg, NULL_WORD);
1806 
1807   // Appears unlocked - try to swing _owner from null to non-null.
1808   // Ideally, I'd manifest "Self" with get_thread and then attempt
1809   // to CAS the register containing Self into m->Owner.
1810   // But we don't have enough registers, so instead we try to CAS either
1811   // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1812   // we later store "Self" into m->Owner.  Transiently storing a stack address
1813   // (rsp or the address of the box) into  m->owner is harmless.
1814   // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1815   lock();
1816   cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1817   movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1818   // If we weren't able to swing _owner from NULL to the BasicLock
1819   // then take the slow path.
1820   jccb  (Assembler::notZero, DONE_LABEL);
1821   // update _owner from BasicLock to thread
1822   get_thread (scrReg);                    // beware: clobbers ICCs
1823   movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1824   xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1825 
1826   // If the CAS fails we can either retry or pass control to the slow-path.
1827   // We use the latter tactic.
1828   // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1829   // If the CAS was successful ...
1830   //   Self has acquired the lock
1831   //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1832   // Intentional fall-through into DONE_LABEL ...
1833 #else // _LP64
1834   // It's inflated
1835   movq(scrReg, tmpReg);
1836   xorq(tmpReg, tmpReg);
1837 
1838   lock();
1839   cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1840   // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1841   // Without cast to int32_t movptr will destroy r10 which is typically obj.
1842   movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1843   // Intentional fall-through into DONE_LABEL ...
1844   // Propagate ICC.ZF from CAS above into DONE_LABEL.
1845 #endif // _LP64
1846 #if INCLUDE_RTM_OPT
1847   } // use_rtm()
1848 #endif
1849   // DONE_LABEL is a hot target - we'd really like to place it at the
1850   // start of cache line by padding with NOPs.
1851   // See the AMD and Intel software optimization manuals for the
1852   // most efficient "long" NOP encodings.
1853   // Unfortunately none of our alignment mechanisms suffice.
1854   bind(DONE_LABEL);
1855 
1856   // At DONE_LABEL the icc ZFlag is set as follows ...
1857   // Fast_Unlock uses the same protocol.
1858   // ZFlag == 1 -> Success
1859   // ZFlag == 0 -> Failure - force control through the slow-path
1860 }
1861 
1862 // obj: object to unlock
1863 // box: box address (displaced header location), killed.  Must be EAX.
1864 // tmp: killed, cannot be obj nor box.
1865 //
1866 // Some commentary on balanced locking:
1867 //
1868 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1869 // Methods that don't have provably balanced locking are forced to run in the
1870 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1871 // The interpreter provides two properties:
1872 // I1:  At return-time the interpreter automatically and quietly unlocks any
1873 //      objects acquired by the current activation (frame).  Recall that the
1874 //      interpreter maintains an on-stack list of locks currently held by
1875 //      a frame.
1876 // I2:  If a method attempts to unlock an object that is not held by
1877 //      the frame the interpreter throws IMSX.
1878 //
1879 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1880 // B() doesn't have provably balanced locking so it runs in the interpreter.
1881 // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1882 // is still locked by A().
1883 //
1884 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1885 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1886 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1887 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1888 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
1889 // could reasonably *avoid* checking the owner in Fast_Unlock().
1890 // In the interest of performance we elide the m->Owner==Self check in unlock.
1891 // A perfectly viable alternative is to elide the owner check except when
1892 // Xcheck:jni is enabled.
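     //
     // A rough, illustrative sketch (not the emitted code) of the fast_unlock
     // flow below, assuming biased locking and RTM are disabled:
     //
     //   if (box->dhw == 0) { ZF = 1; goto DONE; }                // recursive stack-lock
     //   mark = obj->mark();
     //   if (!(mark & monitor_value)) {                           // plain stack-lock
     //     ZF = CAS(&obj->mark, box, box->dhw);                   // restore displaced header
     //     goto DONE;
     //   }
     //   // inflated: attempt a 1-0 exit
     //   if (m->recursions != 0)           { ZF = 0; goto DONE; } // force slow path
     //   if ((m->cxq | m->EntryList) == 0) { m->owner = NULL; ZF = 1; goto DONE; }
     //   // otherwise consult m->succ and possibly regrab the lock (see CheckSucc below)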
1893 
1894 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1895   assert(boxReg == rax, "");
1896   assert_different_registers(objReg, boxReg, tmpReg);
1897 
1898   Label DONE_LABEL, Stacked, CheckSucc;
1899 
1900   // Critically, the biased locking test must have precedence over
1901   // and appear before the (box->dhw == 0) recursive stack-lock test.
1902   if (UseBiasedLocking && !UseOptoBiasInlining) {
1903     biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1904   }
1905 
1906 #if INCLUDE_RTM_OPT
1907   if (UseRTMForStackLocks && use_rtm) {
1908     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1909     Label L_regular_unlock;
1910     movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));           // fetch markword
1911     andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1912     cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1913     jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1914     xend();                                       // otherwise end...
1915     jmp(DONE_LABEL);                              // ... and we're done
1916     bind(L_regular_unlock);
1917   }
1918 #endif
1919 
1920   cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1921   jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1922   movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));             // Examine the object's markword
1923   testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1924   jccb  (Assembler::zero, Stacked);
1925 
1926   // It's inflated.
1927 #if INCLUDE_RTM_OPT
1928   if (use_rtm) {
1929     Label L_regular_inflated_unlock;
1930     int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1931     movptr(boxReg, Address(tmpReg, owner_offset));
1932     testptr(boxReg, boxReg);
1933     jccb(Assembler::notZero, L_regular_inflated_unlock);
1934     xend();
1935     jmpb(DONE_LABEL);
1936     bind(L_regular_inflated_unlock);
1937   }
1938 #endif
1939 
1940   // Despite our balanced locking property we still check that m->_owner == Self
1941   // as java routines or native JNI code called by this thread might
1942   // have released the lock.
1943   // Refer to the comments in synchronizer.cpp for how we might encode extra
1944   // state in _succ so we can avoid fetching EntryList|cxq.
1945   //
1946   // I'd like to add more cases in fast_lock() and fast_unlock() --
1947   // such as recursive enter and exit -- but we have to be wary of
1948   // I$ bloat, T$ effects and BP$ effects.
1949   //
1950   // If there's no contention try a 1-0 exit.  That is, exit without
1951   // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
1952   // we detect and recover from the race that the 1-0 exit admits.
1953   //
1954   // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1955   // before it STs null into _owner, releasing the lock.  Updates
1956   // to data protected by the critical section must be visible before
1957   // we drop the lock (and thus before any other thread could acquire
1958   // the lock and observe the fields protected by the lock).
1959   // IA32's memory-model is SPO, so STs are ordered with respect to
1960   // each other and there's no need for an explicit barrier (fence).
1961   // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1962 #ifndef _LP64
1963   get_thread (boxReg);
1964 
1965   // Note that we could employ various encoding schemes to reduce
1966   // the number of loads below (currently 4) to just 2 or 3.
1967   // Refer to the comments in synchronizer.cpp.
1968   // In practice the chain of fetches doesn't seem to impact performance, however.
1969   xorptr(boxReg, boxReg);
1970   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1971   jccb  (Assembler::notZero, DONE_LABEL);
1972   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1973   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1974   jccb  (Assembler::notZero, CheckSucc);
1975   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1976   jmpb  (DONE_LABEL);
1977 
1978   bind (Stacked);
1979   // It's not inflated and it's not recursively stack-locked and it's not biased.
1980   // It must be stack-locked.
1981   // Try to reset the header to displaced header.
1982   // The "box" value on the stack is stable, so we can reload
1983   // and be assured we observe the same value as above.
1984   movptr(tmpReg, Address(boxReg, 0));
1985   lock();
1986   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
1987   // Intentional fall-through into DONE_LABEL
1988 
1989   // DONE_LABEL is a hot target - we'd really like to place it at the
1990   // start of cache line by padding with NOPs.
1991   // See the AMD and Intel software optimization manuals for the
1992   // most efficient "long" NOP encodings.
1993   // Unfortunately none of our alignment mechanisms suffice.
1994   bind (CheckSucc);
1995 #else // _LP64
1996   // It's inflated
1997   xorptr(boxReg, boxReg);
1998   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1999   jccb  (Assembler::notZero, DONE_LABEL);
2000   movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2001   orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2002   jccb  (Assembler::notZero, CheckSucc);
2003   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2004   jmpb  (DONE_LABEL);
2005 
2006   // Try to avoid passing control into the slow_path ...
2007   Label LSuccess, LGoSlowPath ;
2008   bind  (CheckSucc);
2009 
2010   // The following optional optimization can be elided if necessary
2011   // Effectively: if (succ == null) goto SlowPath
2012   // The code reduces the window for a race, however,
2013   // and thus benefits performance.
2014   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2015   jccb  (Assembler::zero, LGoSlowPath);
2016 
2017   xorptr(boxReg, boxReg);
2018   movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2019 
2020   // Memory barrier/fence
2021   // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2022   // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2023   // This is faster on Nehalem and AMD Shanghai/Barcelona.
2024   // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2025   // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2026   // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2027   lock(); addl(Address(rsp, 0), 0);
2028 
2029   cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2030   jccb  (Assembler::notZero, LSuccess);
2031 
2032   // Rare inopportune interleaving - race.
2033   // The successor vanished in the small window above.
2034   // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2035   // We need to ensure progress and succession.
2036   // Try to reacquire the lock.
2037   // If that fails then the new owner is responsible for succession and this
2038   // thread needs to take no further action and can exit via the fast path (success).
2039   // If the re-acquire succeeds then pass control into the slow path.
2040   // As implemented, this latter mode is horrible because we generate more
2041   // coherence traffic on the lock *and* artificially extend the critical section
2042   // length by virtue of passing control into the slow path.
2043 
2044   // box is really RAX -- the following CMPXCHG depends on that binding
2045   // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2046   lock();
2047   cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2048   // There's no successor so we tried to regrab the lock.
2049   // If that didn't work, then another thread grabbed the
2050   // lock so we're done (and exit was a success).
2051   jccb  (Assembler::notEqual, LSuccess);
2052   // Intentional fall-through into slow-path
2053 
2054   bind  (LGoSlowPath);
2055   orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2056   jmpb  (DONE_LABEL);
2057 
2058   bind  (LSuccess);
2059   testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2060   jmpb  (DONE_LABEL);
2061 
2062   bind  (Stacked);
2063   movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2064   lock();
2065   cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
2066 
2067 #endif
2068   bind(DONE_LABEL);
2069 }
2070 #endif // COMPILER2
2071 
2072 void MacroAssembler::c2bool(Register x) {
2073   // implements x == 0 ? 0 : 1
2074   // note: must only look at least-significant byte of x
2075   //       since C-style booleans are stored in one byte
2076   //       only! (was bug)
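       //       e.g. x = 0x00000100 -> low byte 0x00 -> result 0,
       //            x = 0x00000001 -> low byte 0x01 -> result 1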
2077   andl(x, 0xFF);
2078   setb(Assembler::notZero, x);
2079 }
2080 
2081 // Wouldn't need if AddressLiteral version had new name
2082 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2083   Assembler::call(L, rtype);
2084 }
2085 
2086 void MacroAssembler::call(Register entry) {
2087   Assembler::call(entry);
2088 }
2089 
2090 void MacroAssembler::call(AddressLiteral entry) {
2091   if (reachable(entry)) {
2092     Assembler::call_literal(entry.target(), entry.rspec());
2093   } else {
2094     lea(rscratch1, entry);
2095     Assembler::call(rscratch1);
2096   }
2097 }
2098 
2099 void MacroAssembler::ic_call(address entry, jint method_index) {
2100   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2101   movptr(rax, (intptr_t)Universe::non_oop_word());
2102   call(AddressLiteral(entry, rh));
2103 }
2104 
2105 // Implementation of call_VM versions
2106 
2107 void MacroAssembler::call_VM(Register oop_result,
2108                              address entry_point,
2109                              bool check_exceptions) {
2110   Label C, E;
2111   call(C, relocInfo::none);
2112   jmp(E);
2113 
2114   bind(C);
2115   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2116   ret(0);
2117 
2118   bind(E);
2119 }
2120 
2121 void MacroAssembler::call_VM(Register oop_result,
2122                              address entry_point,
2123                              Register arg_1,
2124                              bool check_exceptions) {
2125   Label C, E;
2126   call(C, relocInfo::none);
2127   jmp(E);
2128 
2129   bind(C);
2130   pass_arg1(this, arg_1);
2131   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2132   ret(0);
2133 
2134   bind(E);
2135 }
2136 
2137 void MacroAssembler::call_VM(Register oop_result,
2138                              address entry_point,
2139                              Register arg_1,
2140                              Register arg_2,
2141                              bool check_exceptions) {
2142   Label C, E;
2143   call(C, relocInfo::none);
2144   jmp(E);
2145 
2146   bind(C);
2147 
2148   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2149 
2150   pass_arg2(this, arg_2);
2151   pass_arg1(this, arg_1);
2152   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2153   ret(0);
2154 
2155   bind(E);
2156 }
2157 
2158 void MacroAssembler::call_VM(Register oop_result,
2159                              address entry_point,
2160                              Register arg_1,
2161                              Register arg_2,
2162                              Register arg_3,
2163                              bool check_exceptions) {
2164   Label C, E;
2165   call(C, relocInfo::none);
2166   jmp(E);
2167 
2168   bind(C);
2169 
2170   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2171   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2172   pass_arg3(this, arg_3);
2173 
2174   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2175   pass_arg2(this, arg_2);
2176 
2177   pass_arg1(this, arg_1);
2178   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2179   ret(0);
2180 
2181   bind(E);
2182 }
2183 
2184 void MacroAssembler::call_VM(Register oop_result,
2185                              Register last_java_sp,
2186                              address entry_point,
2187                              int number_of_arguments,
2188                              bool check_exceptions) {
2189   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2190   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2191 }
2192 
2193 void MacroAssembler::call_VM(Register oop_result,
2194                              Register last_java_sp,
2195                              address entry_point,
2196                              Register arg_1,
2197                              bool check_exceptions) {
2198   pass_arg1(this, arg_1);
2199   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2200 }
2201 
2202 void MacroAssembler::call_VM(Register oop_result,
2203                              Register last_java_sp,
2204                              address entry_point,
2205                              Register arg_1,
2206                              Register arg_2,
2207                              bool check_exceptions) {
2208 
2209   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2210   pass_arg2(this, arg_2);
2211   pass_arg1(this, arg_1);
2212   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2213 }
2214 
2215 void MacroAssembler::call_VM(Register oop_result,
2216                              Register last_java_sp,
2217                              address entry_point,
2218                              Register arg_1,
2219                              Register arg_2,
2220                              Register arg_3,
2221                              bool check_exceptions) {
2222   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2223   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2224   pass_arg3(this, arg_3);
2225   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2226   pass_arg2(this, arg_2);
2227   pass_arg1(this, arg_1);
2228   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2229 }
2230 
2231 void MacroAssembler::super_call_VM(Register oop_result,
2232                                    Register last_java_sp,
2233                                    address entry_point,
2234                                    int number_of_arguments,
2235                                    bool check_exceptions) {
2236   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2237   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2238 }
2239 
2240 void MacroAssembler::super_call_VM(Register oop_result,
2241                                    Register last_java_sp,
2242                                    address entry_point,
2243                                    Register arg_1,
2244                                    bool check_exceptions) {
2245   pass_arg1(this, arg_1);
2246   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2247 }
2248 
2249 void MacroAssembler::super_call_VM(Register oop_result,
2250                                    Register last_java_sp,
2251                                    address entry_point,
2252                                    Register arg_1,
2253                                    Register arg_2,
2254                                    bool check_exceptions) {
2255 
2256   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2257   pass_arg2(this, arg_2);
2258   pass_arg1(this, arg_1);
2259   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2260 }
2261 
2262 void MacroAssembler::super_call_VM(Register oop_result,
2263                                    Register last_java_sp,
2264                                    address entry_point,
2265                                    Register arg_1,
2266                                    Register arg_2,
2267                                    Register arg_3,
2268                                    bool check_exceptions) {
2269   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2270   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2271   pass_arg3(this, arg_3);
2272   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2273   pass_arg2(this, arg_2);
2274   pass_arg1(this, arg_1);
2275   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2276 }
2277 
2278 void MacroAssembler::call_VM_base(Register oop_result,
2279                                   Register java_thread,
2280                                   Register last_java_sp,
2281                                   address  entry_point,
2282                                   int      number_of_arguments,
2283                                   bool     check_exceptions) {
2284   // determine java_thread register
2285   if (!java_thread->is_valid()) {
2286 #ifdef _LP64
2287     java_thread = r15_thread;
2288 #else
2289     java_thread = rdi;
2290     get_thread(java_thread);
2291 #endif // LP64
2292   }
2293   // determine last_java_sp register
2294   if (!last_java_sp->is_valid()) {
2295     last_java_sp = rsp;
2296   }
2297   // debugging support
2298   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2299   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2300 #ifdef ASSERT
2301   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2302   // r12 is the heapbase.
2303   LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2304 #endif // ASSERT
2305 
2306   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2307   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2308 
2309   // push java thread (becomes first argument of C function)
2310 
2311   NOT_LP64(push(java_thread); number_of_arguments++);
2312   LP64_ONLY(mov(c_rarg0, r15_thread));
2313 
2314   // set last Java frame before call
2315   assert(last_java_sp != rbp, "can't use ebp/rbp");
2316 
2317   // Only interpreter should have to set fp
2318   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2319 
2320   // do the call, remove parameters
2321   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2322 
2323   // restore the thread (cannot use the pushed argument since arguments
2324   // may be overwritten by C code generated by an optimizing compiler);
2325   // however can use the register value directly if it is callee saved.
2326   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2327     // rdi & rsi (also r15) are callee saved -> nothing to do
2328 #ifdef ASSERT
2329     guarantee(java_thread != rax, "change this code");
2330     push(rax);
2331     { Label L;
2332       get_thread(rax);
2333       cmpptr(java_thread, rax);
2334       jcc(Assembler::equal, L);
2335       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2336       bind(L);
2337     }
2338     pop(rax);
2339 #endif
2340   } else {
2341     get_thread(java_thread);
2342   }
2343   // reset last Java frame
2344   // Only interpreter should have to clear fp
2345   reset_last_Java_frame(java_thread, true);
2346 
2347    // C++ interp handles this in the interpreter
2348   check_and_handle_popframe(java_thread);
2349   check_and_handle_earlyret(java_thread);
2350 
2351   if (check_exceptions) {
2352     // check for pending exceptions (java_thread is set upon return)
2353     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2354 #ifndef _LP64
2355     jump_cc(Assembler::notEqual,
2356             RuntimeAddress(StubRoutines::forward_exception_entry()));
2357 #else
2358     // This used to conditionally jump to forward_exception; however, if we
2359     // relocate, it is possible that the branch will not reach. So we must jump
2360     // around it so we can always reach the target.
2361 
2362     Label ok;
2363     jcc(Assembler::equal, ok);
2364     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2365     bind(ok);
2366 #endif // LP64
2367   }
2368 
2369   // get oop result if there is one and reset the value in the thread
2370   if (oop_result->is_valid()) {
2371     get_vm_result(oop_result, java_thread);
2372   }
2373 }
2374 
2375 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2376 
2377   // Calculate the value for last_Java_sp
2378   // This is somewhat subtle. call_VM does an intermediate call
2379   // which places a return address on the stack just under the
2380   // stack pointer as the caller left it. This allows
2381   // us to retrieve last_Java_pc from last_Java_sp[-1].
2382   // On 32-bit we then have to push additional args on the stack to accomplish
2383   // the actual requested call. On 64-bit call_VM can only use register args,
2384   // so the only extra space is the return address that call_VM created.
2385   // This hopefully explains the calculations here.
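       //
       // Illustrative sketch (assuming 64-bit, immediately after the intermediate
       // call() in the call_VM wrappers above):
       //   [rsp + 0] : return address pushed by the intermediate call (the future last_Java_pc)
       //   [rsp + 8] : caller's frame ...
       // so last_Java_sp is rsp + wordSize, and last_Java_pc can be read at last_Java_sp[-1].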
2386 
2387 #ifdef _LP64
2388   // We've pushed one address, correct last_Java_sp
2389   lea(rax, Address(rsp, wordSize));
2390 #else
2391   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2392 #endif // LP64
2393 
2394   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2395 
2396 }
2397 
2398 // Use this method when the MacroAssembler version of call_VM_leaf_base() should be called from the interpreter.
2399 void MacroAssembler::call_VM_leaf0(address entry_point) {
2400   MacroAssembler::call_VM_leaf_base(entry_point, 0);
2401 }
2402 
2403 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2404   call_VM_leaf_base(entry_point, number_of_arguments);
2405 }
2406 
2407 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2408   pass_arg0(this, arg_0);
2409   call_VM_leaf(entry_point, 1);
2410 }
2411 
2412 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2413 
2414   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2415   pass_arg1(this, arg_1);
2416   pass_arg0(this, arg_0);
2417   call_VM_leaf(entry_point, 2);
2418 }
2419 
2420 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2421   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2422   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2423   pass_arg2(this, arg_2);
2424   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2425   pass_arg1(this, arg_1);
2426   pass_arg0(this, arg_0);
2427   call_VM_leaf(entry_point, 3);
2428 }
2429 
2430 void MacroAssembler::super_call_VM_leaf(address entry_point) {
2431   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2432 }
2433 
2434 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2435   pass_arg0(this, arg_0);
2436   MacroAssembler::call_VM_leaf_base(entry_point, 1);
2437 }
2438 
2439 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2440 
2441   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2442   pass_arg1(this, arg_1);
2443   pass_arg0(this, arg_0);
2444   MacroAssembler::call_VM_leaf_base(entry_point, 2);
2445 }
2446 
2447 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2448   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2449   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2450   pass_arg2(this, arg_2);
2451   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2452   pass_arg1(this, arg_1);
2453   pass_arg0(this, arg_0);
2454   MacroAssembler::call_VM_leaf_base(entry_point, 3);
2455 }
2456 
2457 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2458   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2459   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2460   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2461   pass_arg3(this, arg_3);
2462   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2463   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2464   pass_arg2(this, arg_2);
2465   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2466   pass_arg1(this, arg_1);
2467   pass_arg0(this, arg_0);
2468   MacroAssembler::call_VM_leaf_base(entry_point, 4);
2469 }
2470 
2471 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2472   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2473   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2474   verify_oop(oop_result, "broken oop in call_VM_base");
2475 }
2476 
2477 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2478   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2479   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2480 }
2481 
2482 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2483 }
2484 
2485 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2486 }
2487 
2488 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2489   if (reachable(src1)) {
2490     cmpl(as_Address(src1), imm);
2491   } else {
2492     lea(rscratch1, src1);
2493     cmpl(Address(rscratch1, 0), imm);
2494   }
2495 }
2496 
2497 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2498   assert(!src2.is_lval(), "use cmpptr");
2499   if (reachable(src2)) {
2500     cmpl(src1, as_Address(src2));
2501   } else {
2502     lea(rscratch1, src2);
2503     cmpl(src1, Address(rscratch1, 0));
2504   }
2505 }
2506 
2507 void MacroAssembler::cmp32(Register src1, int32_t imm) {
2508   Assembler::cmpl(src1, imm);
2509 }
2510 
2511 void MacroAssembler::cmp32(Register src1, Address src2) {
2512   Assembler::cmpl(src1, src2);
2513 }
2514 
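     // cmpsd2int / cmpss2int below produce a three-way compare result in dst:
     // -1 if opr1 < opr2, 0 if equal, +1 if opr1 > opr2.  An unordered result
     // (either operand is NaN, signalled via the parity flag) yields -1 when
     // unordered_is_less is true and +1 otherwise.  Illustrative example:
     // opr1 = 1.0, opr2 = NaN, unordered_is_less = true  =>  dst = -1.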
2515 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2516   ucomisd(opr1, opr2);
2517 
2518   Label L;
2519   if (unordered_is_less) {
2520     movl(dst, -1);
2521     jcc(Assembler::parity, L);
2522     jcc(Assembler::below , L);
2523     movl(dst, 0);
2524     jcc(Assembler::equal , L);
2525     increment(dst);
2526   } else { // unordered is greater
2527     movl(dst, 1);
2528     jcc(Assembler::parity, L);
2529     jcc(Assembler::above , L);
2530     movl(dst, 0);
2531     jcc(Assembler::equal , L);
2532     decrementl(dst);
2533   }
2534   bind(L);
2535 }
2536 
2537 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2538   ucomiss(opr1, opr2);
2539 
2540   Label L;
2541   if (unordered_is_less) {
2542     movl(dst, -1);
2543     jcc(Assembler::parity, L);
2544     jcc(Assembler::below , L);
2545     movl(dst, 0);
2546     jcc(Assembler::equal , L);
2547     increment(dst);
2548   } else { // unordered is greater
2549     movl(dst, 1);
2550     jcc(Assembler::parity, L);
2551     jcc(Assembler::above , L);
2552     movl(dst, 0);
2553     jcc(Assembler::equal , L);
2554     decrementl(dst);
2555   }
2556   bind(L);
2557 }
2558 
2559 
2560 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2561   if (reachable(src1)) {
2562     cmpb(as_Address(src1), imm);
2563   } else {
2564     lea(rscratch1, src1);
2565     cmpb(Address(rscratch1, 0), imm);
2566   }
2567 }
2568 
2569 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2570 #ifdef _LP64
2571   if (src2.is_lval()) {
2572     movptr(rscratch1, src2);
2573     Assembler::cmpq(src1, rscratch1);
2574   } else if (reachable(src2)) {
2575     cmpq(src1, as_Address(src2));
2576   } else {
2577     lea(rscratch1, src2);
2578     Assembler::cmpq(src1, Address(rscratch1, 0));
2579   }
2580 #else
2581   if (src2.is_lval()) {
2582     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2583   } else {
2584     cmpl(src1, as_Address(src2));
2585   }
2586 #endif // _LP64
2587 }
2588 
2589 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2590   assert(src2.is_lval(), "not a mem-mem compare");
2591 #ifdef _LP64
2592   // moves src2's literal address
2593   movptr(rscratch1, src2);
2594   Assembler::cmpq(src1, rscratch1);
2595 #else
2596   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2597 #endif // _LP64
2598 }
2599 
2600 void MacroAssembler::cmpoop(Register src1, Register src2) {
2601   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2602   bs->obj_equals(this, src1, src2);
2603 }
2604 
2605 void MacroAssembler::cmpoop(Register src1, Address src2) {
2606   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2607   bs->obj_equals(this, src1, src2);
2608 }
2609 
2610 #ifdef _LP64
2611 void MacroAssembler::cmpoop(Register src1, jobject src2) {
2612   movoop(rscratch1, src2);
2613   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2614   bs->obj_equals(this, src1, rscratch1);
2615 }
2616 #endif
2617 
2618 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2619   if (reachable(adr)) {
2620     lock();
2621     cmpxchgptr(reg, as_Address(adr));
2622   } else {
2623     lea(rscratch1, adr);
2624     lock();
2625     cmpxchgptr(reg, Address(rscratch1, 0));
2626   }
2627 }
2628 
2629 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2630   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2631 }
2632 
2633 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2634   if (reachable(src)) {
2635     Assembler::comisd(dst, as_Address(src));
2636   } else {
2637     lea(rscratch1, src);
2638     Assembler::comisd(dst, Address(rscratch1, 0));
2639   }
2640 }
2641 
2642 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2643   if (reachable(src)) {
2644     Assembler::comiss(dst, as_Address(src));
2645   } else {
2646     lea(rscratch1, src);
2647     Assembler::comiss(dst, Address(rscratch1, 0));
2648   }
2649 }
2650 
2651 
2652 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2653   Condition negated_cond = negate_condition(cond);
2654   Label L;
2655   jcc(negated_cond, L);
2656   pushf(); // Preserve flags
2657   atomic_incl(counter_addr);
2658   popf();
2659   bind(L);
2660 }
2661 
2662 int MacroAssembler::corrected_idivl(Register reg) {
2663   // Full implementation of Java idiv and irem; checks for
2664   // special case as described in JVM spec., p.243 & p.271.
2665   // The function returns the (pc) offset of the idivl
2666   // instruction - may be needed for implicit exceptions.
2667   //
2668   //         normal case                           special case
2669   //
2670   // input : rax,: dividend                         min_int
2671   //         reg: divisor   (may not be rax,/rdx)   -1
2672   //
2673   // output: rax,: quotient  (= rax, idiv reg)       min_int
2674   //         rdx: remainder (= rax, irem reg)       0
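       // Note: the special case exists because the hardware idivl raises #DE for
       // min_int / -1 (the true quotient +2^31 is not representable); the code
       // below skips idivl in that case and leaves quotient = min_int in rax
       // and remainder = 0 in rdx, as the JVM spec requires.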
2675   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
2676   const int min_int = 0x80000000;
2677   Label normal_case, special_case;
2678 
2679   // check for special case
2680   cmpl(rax, min_int);
2681   jcc(Assembler::notEqual, normal_case);
2682   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2683   cmpl(reg, -1);
2684   jcc(Assembler::equal, special_case);
2685 
2686   // handle normal case
2687   bind(normal_case);
2688   cdql();
2689   int idivl_offset = offset();
2690   idivl(reg);
2691 
2692   // normal and special case exit
2693   bind(special_case);
2694 
2695   return idivl_offset;
2696 }
2697 
2698 
2699 
2700 void MacroAssembler::decrementl(Register reg, int value) {
2701   if (value == min_jint) {subl(reg, value) ; return; }
2702   if (value <  0) { incrementl(reg, -value); return; }
2703   if (value == 0) {                        ; return; }
2704   if (value == 1 && UseIncDec) { decl(reg) ; return; }
2705   /* else */      { subl(reg, value)       ; return; }
2706 }
2707 
2708 void MacroAssembler::decrementl(Address dst, int value) {
2709   if (value == min_jint) {subl(dst, value) ; return; }
2710   if (value <  0) { incrementl(dst, -value); return; }
2711   if (value == 0) {                        ; return; }
2712   if (value == 1 && UseIncDec) { decl(dst) ; return; }
2713   /* else */      { subl(dst, value)       ; return; }
2714 }
2715 
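     // Illustrative example for division_with_shift below, with shift_value = 2
     // (i.e. divide by 4):
     //   reg = -7: negative, so add the bias (1 << 2) - 1 = 3 -> -4; sar 2 -> -1
     //   reg = +7: positive, no bias;                           7 sar 2 ->  1
     // The bias makes the arithmetic shift round toward zero, matching Java division.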
2716 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2717   assert (shift_value > 0, "illegal shift value");
2718   Label _is_positive;
2719   testl (reg, reg);
2720   jcc (Assembler::positive, _is_positive);
2721   int offset = (1 << shift_value) - 1 ;
2722 
2723   if (offset == 1) {
2724     incrementl(reg);
2725   } else {
2726     addl(reg, offset);
2727   }
2728 
2729   bind (_is_positive);
2730   sarl(reg, shift_value);
2731 }
2732 
2733 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2734   if (reachable(src)) {
2735     Assembler::divsd(dst, as_Address(src));
2736   } else {
2737     lea(rscratch1, src);
2738     Assembler::divsd(dst, Address(rscratch1, 0));
2739   }
2740 }
2741 
2742 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2743   if (reachable(src)) {
2744     Assembler::divss(dst, as_Address(src));
2745   } else {
2746     lea(rscratch1, src);
2747     Assembler::divss(dst, Address(rscratch1, 0));
2748   }
2749 }
2750 
2751 // !defined(COMPILER2) is because of stupid core builds
2752 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2753 void MacroAssembler::empty_FPU_stack() {
2754   if (VM_Version::supports_mmx()) {
2755     emms();
2756   } else {
2757     for (int i = 8; i-- > 0; ) ffree(i);
2758   }
2759 }
2760 #endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2761 
2762 
2763 void MacroAssembler::enter() {
2764   push(rbp);
2765   mov(rbp, rsp);
2766 }
2767 
2768 // A 5 byte nop that is safe for patching (see patch_verified_entry)
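     // (The non-AddressNop fallback below emits four segment-override prefixes
     //  followed by 0x90, which decodes as a single 5-byte instruction.)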
2769 void MacroAssembler::fat_nop() {
2770   if (UseAddressNop) {
2771     addr_nop_5();
2772   } else {
2773     emit_int8(0x26); // es:
2774     emit_int8(0x2e); // cs:
2775     emit_int8(0x64); // fs:
2776     emit_int8(0x65); // gs:
2777     emit_int8((unsigned char)0x90);
2778   }
2779 }
2780 
2781 void MacroAssembler::fcmp(Register tmp) {
2782   fcmp(tmp, 1, true, true);
2783 }
2784 
2785 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2786   assert(!pop_right || pop_left, "usage error");
2787   if (VM_Version::supports_cmov()) {
2788     assert(tmp == noreg, "unneeded temp");
2789     if (pop_left) {
2790       fucomip(index);
2791     } else {
2792       fucomi(index);
2793     }
2794     if (pop_right) {
2795       fpop();
2796     }
2797   } else {
2798     assert(tmp != noreg, "need temp");
2799     if (pop_left) {
2800       if (pop_right) {
2801         fcompp();
2802       } else {
2803         fcomp(index);
2804       }
2805     } else {
2806       fcom(index);
2807     }
2808     // convert FPU condition into eflags condition via rax,
2809     save_rax(tmp);
2810     fwait(); fnstsw_ax();
2811     sahf();
2812     restore_rax(tmp);
2813   }
2814   // condition codes set as follows:
2815   //
2816   // CF (corresponds to C0) if x < y
2817   // PF (corresponds to C2) if unordered
2818   // ZF (corresponds to C3) if x = y
2819 }
2820 
2821 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2822   fcmp2int(dst, unordered_is_less, 1, true, true);
2823 }
2824 
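     // Materializes the x87 compare result in 'dst' as -1, 0 or +1;
     // 'unordered_is_less' chooses whether an unordered (NaN) compare maps to -1 or +1.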
2825 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2826   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2827   Label L;
2828   if (unordered_is_less) {
2829     movl(dst, -1);
2830     jcc(Assembler::parity, L);
2831     jcc(Assembler::below , L);
2832     movl(dst, 0);
2833     jcc(Assembler::equal , L);
2834     increment(dst);
2835   } else { // unordered is greater
2836     movl(dst, 1);
2837     jcc(Assembler::parity, L);
2838     jcc(Assembler::above , L);
2839     movl(dst, 0);
2840     jcc(Assembler::equal , L);
2841     decrementl(dst);
2842   }
2843   bind(L);
2844 }
2845 
2846 void MacroAssembler::fld_d(AddressLiteral src) {
2847   fld_d(as_Address(src));
2848 }
2849 
2850 void MacroAssembler::fld_s(AddressLiteral src) {
2851   fld_s(as_Address(src));
2852 }
2853 
2854 void MacroAssembler::fld_x(AddressLiteral src) {
2855   Assembler::fld_x(as_Address(src));
2856 }
2857 
2858 void MacroAssembler::fldcw(AddressLiteral src) {
2859   Assembler::fldcw(as_Address(src));
2860 }
2861 
2862 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2863   if (reachable(src)) {
2864     Assembler::mulpd(dst, as_Address(src));
2865   } else {
2866     lea(rscratch1, src);
2867     Assembler::mulpd(dst, Address(rscratch1, 0));
2868   }
2869 }
2870 
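     // Temporarily raises the x87 precision control to 64-bit mantissa (extended
     // precision) by setting both PC bits (0x300); the original control word stays
     // on the stack so restore_precision() can reload it.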
2871 void MacroAssembler::increase_precision() {
2872   subptr(rsp, BytesPerWord);
2873   fnstcw(Address(rsp, 0));
2874   movl(rax, Address(rsp, 0));
2875   orl(rax, 0x300);
2876   push(rax);
2877   fldcw(Address(rsp, 0));
2878   pop(rax);
2879 }
2880 
2881 void MacroAssembler::restore_precision() {
2882   fldcw(Address(rsp, 0));
2883   addptr(rsp, BytesPerWord);
2884 }
2885 
2886 void MacroAssembler::fpop() {
2887   ffree();
2888   fincstp();
2889 }
2890 
2891 void MacroAssembler::load_float(Address src) {
2892   if (UseSSE >= 1) {
2893     movflt(xmm0, src);
2894   } else {
2895     LP64_ONLY(ShouldNotReachHere());
2896     NOT_LP64(fld_s(src));
2897   }
2898 }
2899 
2900 void MacroAssembler::store_float(Address dst) {
2901   if (UseSSE >= 1) {
2902     movflt(dst, xmm0);
2903   } else {
2904     LP64_ONLY(ShouldNotReachHere());
2905     NOT_LP64(fstp_s(dst));
2906   }
2907 }
2908 
2909 void MacroAssembler::load_double(Address src) {
2910   if (UseSSE >= 2) {
2911     movdbl(xmm0, src);
2912   } else {
2913     LP64_ONLY(ShouldNotReachHere());
2914     NOT_LP64(fld_d(src));
2915   }
2916 }
2917 
2918 void MacroAssembler::store_double(Address dst) {
2919   if (UseSSE >= 2) {
2920     movdbl(dst, xmm0);
2921   } else {
2922     LP64_ONLY(ShouldNotReachHere());
2923     NOT_LP64(fstp_d(dst));
2924   }
2925 }
2926 
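     // Computes the remainder of ST(0) divided by ST(1) with fprem, looping until
     // the partial-remainder flag (C2) clears, then pops the divisor (ST1) so
     // repeated calls do not overflow the FPU stack.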
2927 void MacroAssembler::fremr(Register tmp) {
2928   save_rax(tmp);
2929   { Label L;
2930     bind(L);
2931     fprem();
2932     fwait(); fnstsw_ax();
2933 #ifdef _LP64
2934     testl(rax, 0x400);
2935     jcc(Assembler::notEqual, L);
2936 #else
2937     sahf();
2938     jcc(Assembler::parity, L);
2939 #endif // _LP64
2940   }
2941   restore_rax(tmp);
2942   // Result is in ST0.
2943   // Note: fxch & fpop to get rid of ST1
2944   // (otherwise FPU stack could overflow eventually)
2945   fxch(1);
2946   fpop();
2947 }
2948 
2949 // dst = c = a * b + c
2950 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2951   Assembler::vfmadd231sd(c, a, b);
2952   if (dst != c) {
2953     movdbl(dst, c);
2954   }
2955 }
2956 
2957 // dst = c = a * b + c
2958 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2959   Assembler::vfmadd231ss(c, a, b);
2960   if (dst != c) {
2961     movflt(dst, c);
2962   }
2963 }
2964 
2965 // dst = c = a * b + c
2966 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2967   Assembler::vfmadd231pd(c, a, b, vector_len);
2968   if (dst != c) {
2969     vmovdqu(dst, c);
2970   }
2971 }
2972 
2973 // dst = c = a * b + c
2974 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2975   Assembler::vfmadd231ps(c, a, b, vector_len);
2976   if (dst != c) {
2977     vmovdqu(dst, c);
2978   }
2979 }
2980 
2981 // dst = c = a * b + c
2982 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2983   Assembler::vfmadd231pd(c, a, b, vector_len);
2984   if (dst != c) {
2985     vmovdqu(dst, c);
2986   }
2987 }
2988 
2989 // dst = c = a * b + c
2990 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2991   Assembler::vfmadd231ps(c, a, b, vector_len);
2992   if (dst != c) {
2993     vmovdqu(dst, c);
2994   }
2995 }
2996 
2997 void MacroAssembler::incrementl(AddressLiteral dst) {
2998   if (reachable(dst)) {
2999     incrementl(as_Address(dst));
3000   } else {
3001     lea(rscratch1, dst);
3002     incrementl(Address(rscratch1, 0));
3003   }
3004 }
3005 
3006 void MacroAssembler::incrementl(ArrayAddress dst) {
3007   incrementl(as_Address(dst));
3008 }
3009 
3010 void MacroAssembler::incrementl(Register reg, int value) {
3011   if (value == min_jint) {addl(reg, value) ; return; }
3012   if (value <  0) { decrementl(reg, -value); return; }
3013   if (value == 0) {                        ; return; }
3014   if (value == 1 && UseIncDec) { incl(reg) ; return; }
3015   /* else */      { addl(reg, value)       ; return; }
3016 }
3017 
3018 void MacroAssembler::incrementl(Address dst, int value) {
3019   if (value == min_jint) {addl(dst, value) ; return; }
3020   if (value <  0) { decrementl(dst, -value); return; }
3021   if (value == 0) {                        ; return; }
3022   if (value == 1 && UseIncDec) { incl(dst) ; return; }
3023   /* else */      { addl(dst, value)       ; return; }
3024 }
3025 
3026 void MacroAssembler::jump(AddressLiteral dst) {
3027   if (reachable(dst)) {
3028     jmp_literal(dst.target(), dst.rspec());
3029   } else {
3030     lea(rscratch1, dst);
3031     jmp(rscratch1);
3032   }
3033 }
3034 
3035 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3036   if (reachable(dst)) {
3037     InstructionMark im(this);
3038     relocate(dst.reloc());
3039     const int short_size = 2;
3040     const int long_size = 6;
3041     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3042     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3043       // 0111 tttn #8-bit disp
3044       emit_int8(0x70 | cc);
3045       emit_int8((offs - short_size) & 0xFF);
3046     } else {
3047       // 0000 1111 1000 tttn #32-bit disp
3048       emit_int8(0x0F);
3049       emit_int8((unsigned char)(0x80 | cc));
3050       emit_int32(offs - long_size);
3051     }
3052   } else {
3053 #ifdef ASSERT
3054     warning("reversing conditional branch");
3055 #endif /* ASSERT */
3056     Label skip;
3057     jccb(reverse[cc], skip);
3058     lea(rscratch1, dst);
3059     Assembler::jmp(rscratch1);
3060     bind(skip);
3061   }
3062 }
3063 
3064 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3065   if (reachable(src)) {
3066     Assembler::ldmxcsr(as_Address(src));
3067   } else {
3068     lea(rscratch1, src);
3069     Assembler::ldmxcsr(Address(rscratch1, 0));
3070   }
3071 }
3072 
3073 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3074   int off;
3075   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3076     off = offset();
3077     movsbl(dst, src); // movsxb
3078   } else {
3079     off = load_unsigned_byte(dst, src);
3080     shll(dst, 24);
3081     sarl(dst, 24);
3082   }
3083   return off;
3084 }
3085 
3086 // Note: load_signed_short used to be called load_signed_word.
3087 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3088 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3089 // The term "word" in HotSpot means a 32- or 64-bit machine word.
3090 int MacroAssembler::load_signed_short(Register dst, Address src) {
3091   int off;
3092   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3093     // This is dubious: a signed 16 => 64 bit extension would seem just as safe,
3094     // but this is what the 64-bit build has always done, which implies callers
3095     // rely only on the low 32 bits of the result.
3096     off = offset();
3097     movswl(dst, src); // movsxw
3098   } else {
3099     off = load_unsigned_short(dst, src);
3100     shll(dst, 16);
3101     sarl(dst, 16);
3102   }
3103   return off;
3104 }
3105 
3106 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3107   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3108   // and "3.9 Partial Register Penalties" (p. 22).
3109   int off;
3110   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3111     off = offset();
3112     movzbl(dst, src); // movzxb
3113   } else {
3114     xorl(dst, dst);
3115     off = offset();
3116     movb(dst, src);
3117   }
3118   return off;
3119 }
3120 
3121 // Note: load_unsigned_short used to be called load_unsigned_word.
3122 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3123   // According to Intel Doc. AP-526, "Zero-Extension of Short" (p. 16)
3124   // and "3.9 Partial Register Penalties" (p. 22).
3125   int off;
3126   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3127     off = offset();
3128     movzwl(dst, src); // movzxw
3129   } else {
3130     xorl(dst, dst);
3131     off = offset();
3132     movw(dst, src);
3133   }
3134   return off;
3135 }
3136 
3137 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3138   switch (size_in_bytes) {
3139 #ifndef _LP64
3140   case  8:
3141     assert(dst2 != noreg, "second dest register required");
3142     movl(dst,  src);
3143     movl(dst2, src.plus_disp(BytesPerInt));
3144     break;
3145 #else
3146   case  8:  movq(dst, src); break;
3147 #endif
3148   case  4:  movl(dst, src); break;
3149   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3150   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3151   default:  ShouldNotReachHere();
3152   }
3153 }
3154 
3155 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3156   switch (size_in_bytes) {
3157 #ifndef _LP64
3158   case  8:
3159     assert(src2 != noreg, "second source register required");
3160     movl(dst,                        src);
3161     movl(dst.plus_disp(BytesPerInt), src2);
3162     break;
3163 #else
3164   case  8:  movq(dst, src); break;
3165 #endif
3166   case  4:  movl(dst, src); break;
3167   case  2:  movw(dst, src); break;
3168   case  1:  movb(dst, src); break;
3169   default:  ShouldNotReachHere();
3170   }
3171 }
3172 
3173 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3174   if (reachable(dst)) {
3175     movl(as_Address(dst), src);
3176   } else {
3177     lea(rscratch1, dst);
3178     movl(Address(rscratch1, 0), src);
3179   }
3180 }
3181 
3182 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3183   if (reachable(src)) {
3184     movl(dst, as_Address(src));
3185   } else {
3186     lea(rscratch1, src);
3187     movl(dst, Address(rscratch1, 0));
3188   }
3189 }
3190 
3191 // C++ bool manipulation
3192 
3193 void MacroAssembler::movbool(Register dst, Address src) {
3194   if(sizeof(bool) == 1)
3195     movb(dst, src);
3196   else if(sizeof(bool) == 2)
3197     movw(dst, src);
3198   else if(sizeof(bool) == 4)
3199     movl(dst, src);
3200   else
3201     // unsupported
3202     ShouldNotReachHere();
3203 }
3204 
3205 void MacroAssembler::movbool(Address dst, bool boolconst) {
3206   if(sizeof(bool) == 1)
3207     movb(dst, (int) boolconst);
3208   else if(sizeof(bool) == 2)
3209     movw(dst, (int) boolconst);
3210   else if(sizeof(bool) == 4)
3211     movl(dst, (int) boolconst);
3212   else
3213     // unsupported
3214     ShouldNotReachHere();
3215 }
3216 
3217 void MacroAssembler::movbool(Address dst, Register src) {
3218   if(sizeof(bool) == 1)
3219     movb(dst, src);
3220   else if(sizeof(bool) == 2)
3221     movw(dst, src);
3222   else if(sizeof(bool) == 4)
3223     movl(dst, src);
3224   else
3225     // unsupported
3226     ShouldNotReachHere();
3227 }
3228 
3229 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3230   movb(as_Address(dst), src);
3231 }
3232 
3233 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3234   if (reachable(src)) {
3235     movdl(dst, as_Address(src));
3236   } else {
3237     lea(rscratch1, src);
3238     movdl(dst, Address(rscratch1, 0));
3239   }
3240 }
3241 
3242 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3243   if (reachable(src)) {
3244     movq(dst, as_Address(src));
3245   } else {
3246     lea(rscratch1, src);
3247     movq(dst, Address(rscratch1, 0));
3248   }
3249 }
3250 
3251 #ifdef COMPILER2
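     // Builds the mask (1 << src) - 1 in 'dst' and loads it into k1; used by C2
     // post-loop multiversioning (see PostLoopMultiversioning) to build a vector
     // mask for the residual iterations.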
3252 void MacroAssembler::setvectmask(Register dst, Register src) {
3253   guarantee(PostLoopMultiversioning, "must be");
3254   Assembler::movl(dst, 1);
3255   Assembler::shlxl(dst, dst, src);
3256   Assembler::decl(dst);
3257   Assembler::kmovdl(k1, dst);
3258   Assembler::movl(dst, src);
3259 }
3260 
3261 void MacroAssembler::restorevectmask() {
3262   guarantee(PostLoopMultiversioning, "must be");
3263   Assembler::knotwl(k1, k0);
3264 }
3265 #endif // COMPILER2
3266 
3267 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3268   if (reachable(src)) {
3269     if (UseXmmLoadAndClearUpper) {
3270       movsd (dst, as_Address(src));
3271     } else {
3272       movlpd(dst, as_Address(src));
3273     }
3274   } else {
3275     lea(rscratch1, src);
3276     if (UseXmmLoadAndClearUpper) {
3277       movsd (dst, Address(rscratch1, 0));
3278     } else {
3279       movlpd(dst, Address(rscratch1, 0));
3280     }
3281   }
3282 }
3283 
3284 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3285   if (reachable(src)) {
3286     movss(dst, as_Address(src));
3287   } else {
3288     lea(rscratch1, src);
3289     movss(dst, Address(rscratch1, 0));
3290   }
3291 }
3292 
3293 void MacroAssembler::movptr(Register dst, Register src) {
3294   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3295 }
3296 
3297 void MacroAssembler::movptr(Register dst, Address src) {
3298   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3299 }
3300 
3301 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3302 void MacroAssembler::movptr(Register dst, intptr_t src) {
3303   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3304 }
3305 
3306 void MacroAssembler::movptr(Address dst, Register src) {
3307   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3308 }
3309 
3310 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3311     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3312     Assembler::movdqu(dst, src);
3313 }
3314 
3315 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3316     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3317     Assembler::movdqu(dst, src);
3318 }
3319 
3320 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3321     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3322     Assembler::movdqu(dst, src);
3323 }
3324 
3325 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
3326   if (reachable(src)) {
3327     movdqu(dst, as_Address(src));
3328   } else {
3329     lea(scratchReg, src);
3330     movdqu(dst, Address(scratchReg, 0));
3331   }
3332 }
3333 
3334 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3335     assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3336     Assembler::vmovdqu(dst, src);
3337 }
3338 
3339 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3340     assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3341     Assembler::vmovdqu(dst, src);
3342 }
3343 
3344 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3345     assert(((dst->encoding() < 16  && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3346     Assembler::vmovdqu(dst, src);
3347 }
3348 
3349 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3350   if (reachable(src)) {
3351     vmovdqu(dst, as_Address(src));
3352   }
3353   else {
3354     lea(scratch_reg, src);
3355     vmovdqu(dst, Address(scratch_reg, 0));
3356   }
3357 }
3358 
3359 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
3360   if (reachable(src)) {
3361     Assembler::evmovdquq(dst, as_Address(src), vector_len);
3362   } else {
3363     lea(rscratch, src);
3364     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
3365   }
3366 }
3367 
3368 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3369   if (reachable(src)) {
3370     Assembler::movdqa(dst, as_Address(src));
3371   } else {
3372     lea(rscratch1, src);
3373     Assembler::movdqa(dst, Address(rscratch1, 0));
3374   }
3375 }
3376 
3377 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3378   if (reachable(src)) {
3379     Assembler::movsd(dst, as_Address(src));
3380   } else {
3381     lea(rscratch1, src);
3382     Assembler::movsd(dst, Address(rscratch1, 0));
3383   }
3384 }
3385 
3386 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3387   if (reachable(src)) {
3388     Assembler::movss(dst, as_Address(src));
3389   } else {
3390     lea(rscratch1, src);
3391     Assembler::movss(dst, Address(rscratch1, 0));
3392   }
3393 }
3394 
3395 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3396   if (reachable(src)) {
3397     Assembler::mulsd(dst, as_Address(src));
3398   } else {
3399     lea(rscratch1, src);
3400     Assembler::mulsd(dst, Address(rscratch1, 0));
3401   }
3402 }
3403 
3404 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3405   if (reachable(src)) {
3406     Assembler::mulss(dst, as_Address(src));
3407   } else {
3408     lea(rscratch1, src);
3409     Assembler::mulss(dst, Address(rscratch1, 0));
3410   }
3411 }
3412 
3413 void MacroAssembler::null_check(Register reg, int offset) {
3414   if (needs_explicit_null_check(offset)) {
3415     // provoke OS NULL exception if reg = NULL by
3416     // accessing M[reg] w/o changing any (non-CC) registers
3417     // NOTE: cmpl is plenty here to provoke a segv
3418     cmpptr(rax, Address(reg, 0));
3419     // Note: should probably use testl(rax, Address(reg, 0));
3420     //       may be shorter code (however, this version of
3421     //       testl needs to be implemented first)
3422   } else {
3423     // nothing to do, (later) access of M[reg + offset]
3424     // will provoke OS NULL exception if reg = NULL
3425   }
3426 }
3427 
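     // Value-type support: the helpers below test the relevant access/field flags
     // (JVM_ACC_VALUE, flattenable, flattened) or array storage properties and
     // branch to the supplied label when the queried condition holds.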
3428 void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
3429   movl(temp_reg, Address(klass, Klass::access_flags_offset()));
3430   testl(temp_reg, JVM_ACC_VALUE);
3431   jcc(Assembler::notZero, is_value);
3432 }
3433 
3434 void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
3435   movl(temp_reg, flags);
3436   shrl(temp_reg, ConstantPoolCacheEntry::is_flattenable_field_shift);
3437   andl(temp_reg, 0x1);
3438   testl(temp_reg, temp_reg);
3439   jcc(Assembler::notZero, is_flattenable);
3440 }
3441 
3442 void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& notFlattenable) {
3443   movl(temp_reg, flags);
3444   shrl(temp_reg, ConstantPoolCacheEntry::is_flattenable_field_shift);
3445   andl(temp_reg, 0x1);
3446   testl(temp_reg, temp_reg);
3447   jcc(Assembler::zero, notFlattenable);
3448 }
3449 
3450 void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
3451   movl(temp_reg, flags);
3452   shrl(temp_reg, ConstantPoolCacheEntry::is_flattened_field_shift);
3453   andl(temp_reg, 0x1);
3454   testl(temp_reg, temp_reg);
3455   jcc(Assembler::notZero, is_flattened);
3456 }
3457 
3458 void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg,
3459                                               Label& is_flattened_array) {
3460   load_storage_props(temp_reg, oop);
3461   testb(temp_reg, ArrayStorageProperties::flattened_value);
3462   jcc(Assembler::notZero, is_flattened_array);
3463 }
3464 
3465 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
3466   load_storage_props(temp_reg, oop);
3467   testb(temp_reg, ArrayStorageProperties::null_free_value);
3468   jcc(Assembler::notZero, is_null_free_array);
3469 }
3470 
3471 void MacroAssembler::os_breakpoint() {
3472   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3473   // (e.g., MSVC can't call ps() otherwise)
3474   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3475 }
3476 
3477 void MacroAssembler::unimplemented(const char* what) {
3478   const char* buf = NULL;
3479   {
3480     ResourceMark rm;
3481     stringStream ss;
3482     ss.print("unimplemented: %s", what);
3483     buf = code_string(ss.as_string());
3484   }
3485   stop(buf);
3486 }
3487 
3488 #ifdef _LP64
3489 #define XSTATE_BV 0x200
3490 #endif
3491 
3492 void MacroAssembler::pop_CPU_state() {
3493   pop_FPU_state();
3494   pop_IU_state();
3495 }
3496 
3497 void MacroAssembler::pop_FPU_state() {
3498 #ifndef _LP64
3499   frstor(Address(rsp, 0));
3500 #else
3501   fxrstor(Address(rsp, 0));
3502 #endif
3503   addptr(rsp, FPUStateSizeInWords * wordSize);
3504 }
3505 
3506 void MacroAssembler::pop_IU_state() {
3507   popa();
3508   LP64_ONLY(addq(rsp, 8));
3509   popf();
3510 }
3511 
3512 // Save Integer and Float state
3513 // Warning: Stack must be 16 byte aligned (64bit)
3514 void MacroAssembler::push_CPU_state() {
3515   push_IU_state();
3516   push_FPU_state();
3517 }
3518 
3519 void MacroAssembler::push_FPU_state() {
3520   subptr(rsp, FPUStateSizeInWords * wordSize);
3521 #ifndef _LP64
3522   fnsave(Address(rsp, 0));
3523   fwait();
3524 #else
3525   fxsave(Address(rsp, 0));
3526 #endif // LP64
3527 }
3528 
3529 void MacroAssembler::push_IU_state() {
3530   // Push flags first because pusha kills them
3531   pushf();
3532   // Make sure rsp stays 16-byte aligned
3533   LP64_ONLY(subq(rsp, 8));
3534   pusha();
3535 }
3536 
3537 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register
3538   if (!java_thread->is_valid()) {
3539     java_thread = rdi;
3540     get_thread(java_thread);
3541   }
3542   // we must set sp to zero to clear frame
3543   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3544   if (clear_fp) {
3545     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3546   }
3547 
3548   // Always clear the pc because it could have been set by make_walkable()
3549   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3550 
3551   vzeroupper();
3552 }
3553 
3554 void MacroAssembler::restore_rax(Register tmp) {
3555   if (tmp == noreg) pop(rax);
3556   else if (tmp != rax) mov(rax, tmp);
3557 }
3558 
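     // Rounds 'reg' up to the next multiple of 'modulus'; correct only for
     // power-of-two moduli, since the mask is formed as -modulus.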
3559 void MacroAssembler::round_to(Register reg, int modulus) {
3560   addptr(reg, modulus - 1);
3561   andptr(reg, -modulus);
3562 }
3563 
3564 void MacroAssembler::save_rax(Register tmp) {
3565   if (tmp == noreg) push(rax);
3566   else if (tmp != rax) mov(tmp, rax);
3567 }
3568 
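     // Branches to 'slow_path' when a safepoint or handshake is pending, either by
     // testing the thread-local poll bit or, with global polling, by comparing the
     // VM safepoint state against _not_synchronized.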
3569 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
3570   if (SafepointMechanism::uses_thread_local_poll()) {
3571 #ifdef _LP64
3572     assert(thread_reg == r15_thread, "should be");
3573 #else
3574     if (thread_reg == noreg) {
3575       thread_reg = temp_reg;
3576       get_thread(thread_reg);
3577     }
3578 #endif
3579     testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
3580     jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
3581   } else {
3582     cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
3583         SafepointSynchronize::_not_synchronized);
3584     jcc(Assembler::notEqual, slow_path);
3585   }
3586 }
3587 
3588 // Calls to C land
3589 //
3590 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
3591 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3592 // has to be reset to 0. This is required to allow proper stack traversal.
3593 void MacroAssembler::set_last_Java_frame(Register java_thread,
3594                                          Register last_java_sp,
3595                                          Register last_java_fp,
3596                                          address  last_java_pc) {
3597   vzeroupper();
3598   // determine java_thread register
3599   if (!java_thread->is_valid()) {
3600     java_thread = rdi;
3601     get_thread(java_thread);
3602   }
3603   // determine last_java_sp register
3604   if (!last_java_sp->is_valid()) {
3605     last_java_sp = rsp;
3606   }
3607 
3608   // last_java_fp is optional
3609 
3610   if (last_java_fp->is_valid()) {
3611     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3612   }
3613 
3614   // last_java_pc is optional
3615 
3616   if (last_java_pc != NULL) {
3617     lea(Address(java_thread,
3618                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3619         InternalAddress(last_java_pc));
3620 
3621   }
3622   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3623 }
3624 
3625 void MacroAssembler::shlptr(Register dst, int imm8) {
3626   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3627 }
3628 
3629 void MacroAssembler::shrptr(Register dst, int imm8) {
3630   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3631 }
3632 
3633 void MacroAssembler::sign_extend_byte(Register reg) {
3634   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3635     movsbl(reg, reg); // movsxb
3636   } else {
3637     shll(reg, 24);
3638     sarl(reg, 24);
3639   }
3640 }
3641 
3642 void MacroAssembler::sign_extend_short(Register reg) {
3643   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3644     movswl(reg, reg); // movsxw
3645   } else {
3646     shll(reg, 16);
3647     sarl(reg, 16);
3648   }
3649 }
3650 
3651 void MacroAssembler::testl(Register dst, AddressLiteral src) {
3652   assert(reachable(src), "Address should be reachable");
3653   testl(dst, as_Address(src));
3654 }
3655 
3656 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3657   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3658   Assembler::pcmpeqb(dst, src);
3659 }
3660 
3661 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3662   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3663   Assembler::pcmpeqw(dst, src);
3664 }
3665 
3666 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3667   assert((dst->encoding() < 16),"XMM register should be 0-15");
3668   Assembler::pcmpestri(dst, src, imm8);
3669 }
3670 
3671 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3672   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3673   Assembler::pcmpestri(dst, src, imm8);
3674 }
3675 
3676 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3677   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3678   Assembler::pmovzxbw(dst, src);
3679 }
3680 
3681 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3682   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3683   Assembler::pmovzxbw(dst, src);
3684 }
3685 
3686 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3687   assert((src->encoding() < 16),"XMM register should be 0-15");
3688   Assembler::pmovmskb(dst, src);
3689 }
3690 
3691 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3692   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3693   Assembler::ptest(dst, src);
3694 }
3695 
3696 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3697   if (reachable(src)) {
3698     Assembler::sqrtsd(dst, as_Address(src));
3699   } else {
3700     lea(rscratch1, src);
3701     Assembler::sqrtsd(dst, Address(rscratch1, 0));
3702   }
3703 }
3704 
3705 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3706   if (reachable(src)) {
3707     Assembler::sqrtss(dst, as_Address(src));
3708   } else {
3709     lea(rscratch1, src);
3710     Assembler::sqrtss(dst, Address(rscratch1, 0));
3711   }
3712 }
3713 
3714 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3715   if (reachable(src)) {
3716     Assembler::subsd(dst, as_Address(src));
3717   } else {
3718     lea(rscratch1, src);
3719     Assembler::subsd(dst, Address(rscratch1, 0));
3720   }
3721 }
3722 
3723 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3724   if (reachable(src)) {
3725     Assembler::subss(dst, as_Address(src));
3726   } else {
3727     lea(rscratch1, src);
3728     Assembler::subss(dst, Address(rscratch1, 0));
3729   }
3730 }
3731 
3732 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3733   if (reachable(src)) {
3734     Assembler::ucomisd(dst, as_Address(src));
3735   } else {
3736     lea(rscratch1, src);
3737     Assembler::ucomisd(dst, Address(rscratch1, 0));
3738   }
3739 }
3740 
3741 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3742   if (reachable(src)) {
3743     Assembler::ucomiss(dst, as_Address(src));
3744   } else {
3745     lea(rscratch1, src);
3746     Assembler::ucomiss(dst, Address(rscratch1, 0));
3747   }
3748 }
3749 
3750 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3751   // Used in sign-bit flipping with aligned address.
3752   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3753   if (reachable(src)) {
3754     Assembler::xorpd(dst, as_Address(src));
3755   } else {
3756     lea(scratch_reg, src);
3757     Assembler::xorpd(dst, Address(scratch_reg, 0));
3758   }
3759 }
3760 
3761 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
3762   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3763     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3764   }
3765   else {
3766     Assembler::xorpd(dst, src);
3767   }
3768 }
3769 
3770 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
3771   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
3772     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
3773   } else {
3774     Assembler::xorps(dst, src);
3775   }
3776 }
3777 
3778 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
3779   // Used in sign-bit flipping with aligned address.
3780   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3781   if (reachable(src)) {
3782     Assembler::xorps(dst, as_Address(src));
3783   } else {
3784     lea(scratch_reg, src);
3785     Assembler::xorps(dst, Address(scratch_reg, 0));
3786   }
3787 }
3788 
3789 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3790   // Used in sign-bit flipping with aligned address.
3791   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3792   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3793   if (reachable(src)) {
3794     Assembler::pshufb(dst, as_Address(src));
3795   } else {
3796     lea(rscratch1, src);
3797     Assembler::pshufb(dst, Address(rscratch1, 0));
3798   }
3799 }
3800 
3801 // AVX 3-operands instructions
3802 
3803 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3804   if (reachable(src)) {
3805     vaddsd(dst, nds, as_Address(src));
3806   } else {
3807     lea(rscratch1, src);
3808     vaddsd(dst, nds, Address(rscratch1, 0));
3809   }
3810 }
3811 
3812 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3813   if (reachable(src)) {
3814     vaddss(dst, nds, as_Address(src));
3815   } else {
3816     lea(rscratch1, src);
3817     vaddss(dst, nds, Address(rscratch1, 0));
3818   }
3819 }
3820 
3821 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3822   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3823   vandps(dst, nds, negate_field, vector_len);
3824 }
3825 
3826 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3827   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3828   vandpd(dst, nds, negate_field, vector_len);
3829 }
3830 
3831 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3832   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3833   Assembler::vpaddb(dst, nds, src, vector_len);
3834 }
3835 
3836 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3837   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3838   Assembler::vpaddb(dst, nds, src, vector_len);
3839 }
3840 
3841 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3842   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3843   Assembler::vpaddw(dst, nds, src, vector_len);
3844 }
3845 
3846 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3847   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3848   Assembler::vpaddw(dst, nds, src, vector_len);
3849 }
3850 
3851 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3852   if (reachable(src)) {
3853     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3854   } else {
3855     lea(scratch_reg, src);
3856     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3857   }
3858 }
3859 
3860 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3861   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3862   Assembler::vpbroadcastw(dst, src, vector_len);
3863 }
3864 
3865 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3866   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3867   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3868 }
3869 
3870 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3871   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3872   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3873 }
3874 
3875 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3876   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3877   Assembler::vpmovzxbw(dst, src, vector_len);
3878 }
3879 
3880 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3881   assert((src->encoding() < 16),"XMM register should be 0-15");
3882   Assembler::vpmovmskb(dst, src);
3883 }
3884 
3885 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3886   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3887   Assembler::vpmullw(dst, nds, src, vector_len);
3888 }
3889 
3890 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3891   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3892   Assembler::vpmullw(dst, nds, src, vector_len);
3893 }
3894 
3895 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3896   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3897   Assembler::vpsubb(dst, nds, src, vector_len);
3898 }
3899 
3900 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3901   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3902   Assembler::vpsubb(dst, nds, src, vector_len);
3903 }
3904 
3905 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3906   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3907   Assembler::vpsubw(dst, nds, src, vector_len);
3908 }
3909 
3910 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3911   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3912   Assembler::vpsubw(dst, nds, src, vector_len);
3913 }
3914 
3915 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3916   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3917   Assembler::vpsraw(dst, nds, shift, vector_len);
3918 }
3919 
3920 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3921   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3922   Assembler::vpsraw(dst, nds, shift, vector_len);
3923 }
3924 
3925 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3926   assert(UseAVX > 2,"");
3927   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3928      vector_len = 2;
3929   }
3930   Assembler::evpsraq(dst, nds, shift, vector_len);
3931 }
3932 
3933 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3934   assert(UseAVX > 2,"");
3935   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3936      vector_len = 2;
3937   }
3938   Assembler::evpsraq(dst, nds, shift, vector_len);
3939 }
3940 
3941 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3942   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3943   Assembler::vpsrlw(dst, nds, shift, vector_len);
3944 }
3945 
3946 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3947   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3948   Assembler::vpsrlw(dst, nds, shift, vector_len);
3949 }
3950 
3951 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3952   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3953   Assembler::vpsllw(dst, nds, shift, vector_len);
3954 }
3955 
3956 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3957   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3958   Assembler::vpsllw(dst, nds, shift, vector_len);
3959 }
3960 
3961 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3962   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3963   Assembler::vptest(dst, src);
3964 }
3965 
3966 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3967   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3968   Assembler::punpcklbw(dst, src);
3969 }
3970 
3971 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3972   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3973   Assembler::pshufd(dst, src, mode);
3974 }
3975 
3976 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3977   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3978   Assembler::pshuflw(dst, src, mode);
3979 }
3980 
3981 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3982   if (reachable(src)) {
3983     vandpd(dst, nds, as_Address(src), vector_len);
3984   } else {
3985     lea(scratch_reg, src);
3986     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3987   }
3988 }
3989 
3990 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3991   if (reachable(src)) {
3992     vandps(dst, nds, as_Address(src), vector_len);
3993   } else {
3994     lea(scratch_reg, src);
3995     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3996   }
3997 }
3998 
3999 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4000   if (reachable(src)) {
4001     vdivsd(dst, nds, as_Address(src));
4002   } else {
4003     lea(rscratch1, src);
4004     vdivsd(dst, nds, Address(rscratch1, 0));
4005   }
4006 }
4007 
4008 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4009   if (reachable(src)) {
4010     vdivss(dst, nds, as_Address(src));
4011   } else {
4012     lea(rscratch1, src);
4013     vdivss(dst, nds, Address(rscratch1, 0));
4014   }
4015 }
4016 
4017 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4018   if (reachable(src)) {
4019     vmulsd(dst, nds, as_Address(src));
4020   } else {
4021     lea(rscratch1, src);
4022     vmulsd(dst, nds, Address(rscratch1, 0));
4023   }
4024 }
4025 
4026 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4027   if (reachable(src)) {
4028     vmulss(dst, nds, as_Address(src));
4029   } else {
4030     lea(rscratch1, src);
4031     vmulss(dst, nds, Address(rscratch1, 0));
4032   }
4033 }
4034 
4035 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4036   if (reachable(src)) {
4037     vsubsd(dst, nds, as_Address(src));
4038   } else {
4039     lea(rscratch1, src);
4040     vsubsd(dst, nds, Address(rscratch1, 0));
4041   }
4042 }
4043 
4044 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4045   if (reachable(src)) {
4046     vsubss(dst, nds, as_Address(src));
4047   } else {
4048     lea(rscratch1, src);
4049     vsubss(dst, nds, Address(rscratch1, 0));
4050   }
4051 }
4052 
4053 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4054   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4055   vxorps(dst, nds, src, Assembler::AVX_128bit);
4056 }
4057 
4058 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4059   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
4060   vxorpd(dst, nds, src, Assembler::AVX_128bit);
4061 }
4062 
4063 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4064   if (reachable(src)) {
4065     vxorpd(dst, nds, as_Address(src), vector_len);
4066   } else {
4067     lea(scratch_reg, src);
4068     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
4069   }
4070 }
4071 
4072 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4073   if (reachable(src)) {
4074     vxorps(dst, nds, as_Address(src), vector_len);
4075   } else {
4076     lea(scratch_reg, src);
4077     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
4078   }
4079 }
4080 
4081 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
4082   if (UseAVX > 1 || (vector_len < 1)) {
4083     if (reachable(src)) {
4084       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
4085     } else {
4086       lea(scratch_reg, src);
4087       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
4088     }
4089   }
4090   else {
4091     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
4092   }
4093 }
4094 
4095 //-------------------------------------------------------------------------------------------
4096 #ifdef COMPILER2
4097 // Generic instructions support for use in .ad files C2 code generation
4098 
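     // Absolute value and negation of floating-point vectors are implemented by
     // AND-ing or XOR-ing with sign-mask constants from StubRoutines; the C2
     // opcode selects which operation and mask to use.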
4099 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, Register scr) {
4100   if (opcode == Op_AbsVD) {
4101     andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
4102   } else {
4103     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4104     xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
4105   }
4106 }
4107 
4108 void MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4109   if (opcode == Op_AbsVD) {
4110     vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
4111   } else {
4112     assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
4113     vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
4114   }
4115 }
4116 
4117 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, Register scr) {
4118   if (opcode == Op_AbsVF) {
4119     andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
4120   } else {
4121     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4122     xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
4123   }
4124 }
4125 
4126 void MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
4127   if (opcode == Op_AbsVF) {
4128     vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
4129   } else {
4130     assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
4131     vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
4132   }
4133 }
4134 
4135 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
4136   if (sign) {
4137     pmovsxbw(dst, src);
4138   } else {
4139     pmovzxbw(dst, src);
4140   }
4141 }
4142 
4143 void MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
4144   if (sign) {
4145     vpmovsxbw(dst, src, vector_len);
4146   } else {
4147     vpmovzxbw(dst, src, vector_len);
4148   }
4149 }
4150 
4151 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
4152   if (opcode == Op_RShiftVI) {
4153     psrad(dst, src);
4154   } else if (opcode == Op_LShiftVI) {
4155     pslld(dst, src);
4156   } else {
4157     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4158     psrld(dst, src);
4159   }
4160 }
4161 
4162 void MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4163   if (opcode == Op_RShiftVI) {
4164     vpsrad(dst, nds, src, vector_len);
4165   } else if (opcode == Op_LShiftVI) {
4166     vpslld(dst, nds, src, vector_len);
4167   } else {
4168     assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
4169     vpsrld(dst, nds, src, vector_len);
4170   }
4171 }
4172 
4173 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
4174   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4175     psraw(dst, src);
4176   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4177     psllw(dst, src);
4178   } else {
4179     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4180     psrlw(dst, src);
4181   }
4182 }
4183 
4184 void MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4185   if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
4186     vpsraw(dst, nds, src, vector_len);
4187   } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
4188     vpsllw(dst, nds, src, vector_len);
4189   } else {
4190     assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
4191     vpsrlw(dst, nds, src, vector_len);
4192   }
4193 }
4194 
4195 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
4196   if (opcode == Op_RShiftVL) {
4197     psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
4198   } else if (opcode == Op_LShiftVL) {
4199     psllq(dst, src);
4200   } else {
4201     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4202     psrlq(dst, src);
4203   }
4204 }
4205 
4206 void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4207   if (opcode == Op_RShiftVL) {
4208     evpsraq(dst, nds, src, vector_len);
4209   } else if (opcode == Op_LShiftVL) {
4210     vpsllq(dst, nds, src, vector_len);
4211   } else {
4212     assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
4213     vpsrlq(dst, nds, src, vector_len);
4214   }
4215 }
4216 #endif
4217 //-------------------------------------------------------------------------------------------
4218 
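     // Strips the jweak tag bit from a JNI handle so it can be dereferenced as an
     // ordinary jobject.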
4219 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
4220   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
4221   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
4222   // The inverted mask is sign-extended
4223   andptr(possibly_jweak, inverted_jweak_mask);
4224 }
4225 
4226 void MacroAssembler::resolve_jobject(Register value,
4227                                      Register thread,
4228                                      Register tmp) {
4229   assert_different_registers(value, thread, tmp);
4230   Label done, not_weak;
4231   testptr(value, value);
4232   jcc(Assembler::zero, done);                // Use NULL as-is.
4233   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
4234   jcc(Assembler::zero, not_weak);
4235   // Resolve jweak.
4236   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4237                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
4238   verify_oop(value);
4239   jmp(done);
4240   bind(not_weak);
4241   // Resolve (untagged) jobject.
4242   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
4243   verify_oop(value);
4244   bind(done);
4245 }
4246 
4247 void MacroAssembler::subptr(Register dst, int32_t imm32) {
4248   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
4249 }
4250 
4251 // Force generation of a 4 byte immediate value even if it fits into 8bit
4252 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
4253   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
4254 }
4255 
4256 void MacroAssembler::subptr(Register dst, Register src) {
4257   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
4258 }
4259 
4260 // C++ bool manipulation
4261 void MacroAssembler::testbool(Register dst) {
4262   if(sizeof(bool) == 1)
4263     testb(dst, 0xff);
4264   else if(sizeof(bool) == 2) {
4265     // testw implementation needed for two byte bools
4266     ShouldNotReachHere();
4267   } else if(sizeof(bool) == 4)
4268     testl(dst, dst);
4269   else
4270     // unsupported
4271     ShouldNotReachHere();
4272 }
4273 
4274 void MacroAssembler::testptr(Register dst, Register src) {
4275   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
4276 }
4277 
4278 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4279 void MacroAssembler::tlab_allocate(Register thread, Register obj,
4280                                    Register var_size_in_bytes,
4281                                    int con_size_in_bytes,
4282                                    Register t1,
4283                                    Register t2,
4284                                    Label& slow_case) {
4285   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4286   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4287 }
4288 
4289 // Defines obj, preserves var_size_in_bytes
4290 void MacroAssembler::eden_allocate(Register thread, Register obj,
4291                                    Register var_size_in_bytes,
4292                                    int con_size_in_bytes,
4293                                    Register t1,
4294                                    Label& slow_case) {
4295   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4296   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4297 }
4298 
4299 // Preserves the contents of address, destroys the contents length_in_bytes and temp.
4300 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
4301   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
4302   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
4303   Label done;
4304 
4305   testptr(length_in_bytes, length_in_bytes);
4306   jcc(Assembler::zero, done);
4307 
4308   // initialize topmost word, divide index by 2, check if odd and test if zero
4309   // note: for the remaining code to work, index must be a multiple of BytesPerWord
4310 #ifdef ASSERT
4311   {
4312     Label L;
4313     testptr(length_in_bytes, BytesPerWord - 1);
4314     jcc(Assembler::zero, L);
4315     stop("length must be a multiple of BytesPerWord");
4316     bind(L);
4317   }
4318 #endif
4319   Register index = length_in_bytes;
4320   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
4321   if (UseIncDec) {
4322     shrptr(index, 3);  // divide by 8/16 and set carry flag if bit 2 was set
4323   } else {
4324     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
4325     shrptr(index, 1);
4326   }
4327 #ifndef _LP64
4328   // index may not have been a multiple of 8 (i.e., bit 2 was set)
4329   {
4330     Label even;
4331     // note: if index was a multiple of 8, then it cannot
4332     //       be 0 now otherwise it must have been 0 before
4333     //       => if it is even, we don't need to check for 0 again
4334     jcc(Assembler::carryClear, even);
4335     // clear topmost word (no jump would be needed if conditional assignment worked here)
4336     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
4337     // index could be 0 now, must check again
4338     jcc(Assembler::zero, done);
4339     bind(even);
4340   }
4341 #endif // !_LP64
4342   // initialize remaining object fields: index is a multiple of 2 now
4343   {
4344     Label loop;
4345     bind(loop);
4346     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
4347     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
4348     decrement(index);
4349     jcc(Assembler::notZero, loop);
4350   }
4351 
4352   bind(done);
4353 }
4354 
4355 // Look up the method for a megamorphic invokeinterface call.
4356 // The target method is determined by <intf_klass, itable_index>.
4357 // The receiver klass is in recv_klass.
4358 // On success, the result will be in method_result, and execution falls through.
4359 // On failure, execution transfers to the given label.
4360 void MacroAssembler::lookup_interface_method(Register recv_klass,
4361                                              Register intf_klass,
4362                                              RegisterOrConstant itable_index,
4363                                              Register method_result,
4364                                              Register scan_temp,
4365                                              Label& L_no_such_interface,
4366                                              bool return_method) {
4367   assert_different_registers(recv_klass, intf_klass, scan_temp);
4368   assert_different_registers(method_result, intf_klass, scan_temp);
4369   assert(recv_klass != method_result || !return_method,
4370          "recv_klass can be destroyed when method isn't needed");
4371 
4372   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4373          "caller must use same register for non-constant itable index as for method");
4374 
4375   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
4376   int vtable_base = in_bytes(Klass::vtable_start_offset());
4377   int itentry_off = itableMethodEntry::method_offset_in_bytes();
4378   int scan_step   = itableOffsetEntry::size() * wordSize;
4379   int vte_size    = vtableEntry::size_in_bytes();
4380   Address::ScaleFactor times_vte_scale = Address::times_ptr;
4381   assert(vte_size == wordSize, "else adjust times_vte_scale");
4382 
4383   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4384 
4385   // %%% Could store the aligned, prescaled offset in the klassoop.
4386   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
4387 
4388   if (return_method) {
4389     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4390     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4391     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
4392   }
4393 
4394   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
4395   //   if (scan->interface() == intf) {
4396   //     result = (klass + scan->offset() + itable_index);
4397   //   }
4398   // }
4399   Label search, found_method;
4400 
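  // The scan loop is peeled once: the first itable entry is compared inline so the
  // common case (first entry matches) falls straight through to found_method.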
4401   for (int peel = 1; peel >= 0; peel--) {
4402     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
4403     cmpptr(intf_klass, method_result);
4404 
4405     if (peel) {
4406       jccb(Assembler::equal, found_method);
4407     } else {
4408       jccb(Assembler::notEqual, search);
4409       // (invert the test to fall through to found_method...)
4410     }
4411 
4412     if (!peel)  break;
4413 
4414     bind(search);
4415 
4416     // Check that the previous entry is non-null.  A null entry means that
4417     // the receiver class doesn't implement the interface, and wasn't the
4418     // same as when the caller was compiled.
4419     testptr(method_result, method_result);
4420     jcc(Assembler::zero, L_no_such_interface);
4421     addptr(scan_temp, scan_step);
4422   }
4423 
4424   bind(found_method);
4425 
4426   if (return_method) {
4427     // Got a hit.
4428     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
4429     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
4430   }
4431 }
4432 
4433 
4434 // virtual method calling
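// Loads the Method* stored in vtable slot 'vtable_index' of recv_klass into method_result.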
4435 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4436                                            RegisterOrConstant vtable_index,
4437                                            Register method_result) {
4438   const int base = in_bytes(Klass::vtable_start_offset());
4439   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4440   Address vtable_entry_addr(recv_klass,
4441                             vtable_index, Address::times_ptr,
4442                             base + vtableEntry::method_offset_in_bytes());
4443   movptr(method_result, vtable_entry_addr);
4444 }
4445 
4446 
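// Full subtype check: jumps to L_success if sub_klass is a subtype of super_klass;
// otherwise control falls through (the internal failure label is bound at the end).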
4447 void MacroAssembler::check_klass_subtype(Register sub_klass,
4448                            Register super_klass,
4449                            Register temp_reg,
4450                            Label& L_success) {
4451   Label L_failure;
4452   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
4453   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4454   bind(L_failure);
4455 }
4456 
4457 
4458 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4459                                                    Register super_klass,
4460                                                    Register temp_reg,
4461                                                    Label* L_success,
4462                                                    Label* L_failure,
4463                                                    Label* L_slow_path,
4464                                         RegisterOrConstant super_check_offset) {
4465   assert_different_registers(sub_klass, super_klass, temp_reg);
4466   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
4467   if (super_check_offset.is_register()) {
4468     assert_different_registers(sub_klass, super_klass,
4469                                super_check_offset.as_register());
4470   } else if (must_load_sco) {
4471     assert(temp_reg != noreg, "supply either a temp or a register offset");
4472   }
4473 
4474   Label L_fallthrough;
4475   int label_nulls = 0;
4476   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4477   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4478   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
4479   assert(label_nulls <= 1, "at most one NULL in the batch");
4480 
4481   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4482   int sco_offset = in_bytes(Klass::super_check_offset_offset());
4483   Address super_check_offset_addr(super_klass, sco_offset);
4484 
4485   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
4486   // range of a jccb.  If this routine grows larger, reconsider at
4487   // least some of these.
4488 #define local_jcc(assembler_cond, label)                                \
4489   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
4490   else                             jcc( assembler_cond, label) /*omit semi*/
4491 
4492   // Hacked jmp, which may only be used just before L_fallthrough.
4493 #define final_jmp(label)                                                \
4494   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
4495   else                            jmp(label)                /*omit semi*/
4496 
4497   // If the pointers are equal, we are done (e.g., String[] elements).
4498   // This self-check enables sharing of secondary supertype arrays among
4499   // non-primary types such as array-of-interface.  Otherwise, each such
4500   // type would need its own customized SSA.
4501   // We move this check to the front of the fast path because many
4502   // type checks are in fact trivially successful in this manner,
4503   // so we get a nicely predicted branch right at the start of the check.
4504   cmpptr(sub_klass, super_klass);
4505   local_jcc(Assembler::equal, *L_success);
4506 
4507   // Check the supertype display:
4508   if (must_load_sco) {
4509     // Positive movl does right thing on LP64.
4510     movl(temp_reg, super_check_offset_addr);
4511     super_check_offset = RegisterOrConstant(temp_reg);
4512   }
4513   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
4514   cmpptr(super_klass, super_check_addr); // load displayed supertype
4515 
4516   // This check has worked decisively for primary supers.
4517   // Secondary supers are sought in the super_cache ('super_cache_addr').
4518   // (Secondary supers are interfaces and very deeply nested subtypes.)
4519   // This works in the same check above because of a tricky aliasing
4520   // between the super_cache and the primary super display elements.
4521   // (The 'super_check_addr' can address either, as the case requires.)
4522   // Note that the cache is updated below if it does not help us find
4523   // what we need immediately.
4524   // So if it was a primary super, we can just fail immediately.
4525   // Otherwise, it's the slow path for us (no success at this point).
4526 
4527   if (super_check_offset.is_register()) {
4528     local_jcc(Assembler::equal, *L_success);
4529     cmpl(super_check_offset.as_register(), sc_offset);
4530     if (L_failure == &L_fallthrough) {
4531       local_jcc(Assembler::equal, *L_slow_path);
4532     } else {
4533       local_jcc(Assembler::notEqual, *L_failure);
4534       final_jmp(*L_slow_path);
4535     }
4536   } else if (super_check_offset.as_constant() == sc_offset) {
4537     // Need a slow path; fast failure is impossible.
4538     if (L_slow_path == &L_fallthrough) {
4539       local_jcc(Assembler::equal, *L_success);
4540     } else {
4541       local_jcc(Assembler::notEqual, *L_slow_path);
4542       final_jmp(*L_success);
4543     }
4544   } else {
4545     // No slow path; it's a fast decision.
4546     if (L_failure == &L_fallthrough) {
4547       local_jcc(Assembler::equal, *L_success);
4548     } else {
4549       local_jcc(Assembler::notEqual, *L_failure);
4550       final_jmp(*L_success);
4551     }
4552   }
4553 
4554   bind(L_fallthrough);
4555 
4556 #undef local_jcc
4557 #undef final_jmp
4558 }
4559 
4560 
4561 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4562                                                    Register super_klass,
4563                                                    Register temp_reg,
4564                                                    Register temp2_reg,
4565                                                    Label* L_success,
4566                                                    Label* L_failure,
4567                                                    bool set_cond_codes) {
4568   assert_different_registers(sub_klass, super_klass, temp_reg);
4569   if (temp2_reg != noreg)
4570     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
4571 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
4572 
4573   Label L_fallthrough;
4574   int label_nulls = 0;
4575   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
4576   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
4577   assert(label_nulls <= 1, "at most one NULL in the batch");
4578 
4579   // a couple of useful fields in sub_klass:
4580   int ss_offset = in_bytes(Klass::secondary_supers_offset());
4581   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4582   Address secondary_supers_addr(sub_klass, ss_offset);
4583   Address super_cache_addr(     sub_klass, sc_offset);
4584 
4585   // Do a linear scan of the secondary super-klass chain.
4586   // This code is rarely used, so simplicity is a virtue here.
4587   // The repne_scan instruction uses fixed registers, which we must spill.
4588   // Don't worry too much about pre-existing connections with the input regs.
4589 
4590   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
4591   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
4592 
4593   // Get super_klass value into rax (even if it was in rdi or rcx).
4594   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
4595   if (super_klass != rax || UseCompressedOops) {
4596     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
4597     mov(rax, super_klass);
4598   }
4599   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
4600   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
4601 
4602 #ifndef PRODUCT
4603   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
4604   ExternalAddress pst_counter_addr((address) pst_counter);
4605   NOT_LP64(  incrementl(pst_counter_addr) );
4606   LP64_ONLY( lea(rcx, pst_counter_addr) );
4607   LP64_ONLY( incrementl(Address(rcx, 0)) );
4608 #endif //PRODUCT
4609 
4610   // We will consult the secondary-super array.
4611   movptr(rdi, secondary_supers_addr);
4612   // Load the array length.  (Positive movl does right thing on LP64.)
4613   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
4614   // Skip to start of data.
4615   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
4616 
4617   // Scan RCX words at [RDI] for an occurrence of RAX.
4618   // Set NZ/Z based on last compare.
  // The Z flag is not set by 'repne' if RCX == 0, since 'repne' itself does not
  // change flags; only the repeated scas instruction sets them.
  // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.

  testptr(rax, rax); // Set Z = 0
  repne_scan();
4625 
4626   // Unspill the temp. registers:
4627   if (pushed_rdi)  pop(rdi);
4628   if (pushed_rcx)  pop(rcx);
4629   if (pushed_rax)  pop(rax);
4630 
4631   if (set_cond_codes) {
4632     // Special hack for the AD files:  rdi is guaranteed non-zero.
4633     assert(!pushed_rdi, "rdi must be left non-NULL");
4634     // Also, the condition codes are properly set Z/NZ on succeed/failure.
4635   }
4636 
4637   if (L_failure == &L_fallthrough)
4638         jccb(Assembler::notEqual, *L_failure);
4639   else  jcc(Assembler::notEqual, *L_failure);
4640 
4641   // Success.  Cache the super we found and proceed in triumph.
4642   movptr(super_cache_addr, super_klass);
4643 
4644   if (L_success != &L_fallthrough) {
4645     jmp(*L_success);
4646   }
4647 
4648 #undef IS_A_TEMP
4649 
4650   bind(L_fallthrough);
4651 }
4652 
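// Class initialization barrier: branches to L_fast_path if 'klass' is fully initialized
// or is being initialized by the current 'thread', otherwise to L_slow_path. One of the
// two labels must be NULL; its case then falls through (passing both is unimplemented).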
4653 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
4654   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
4655 
4656   Label L_fallthrough;
4657   if (L_fast_path == NULL) {
4658     L_fast_path = &L_fallthrough;
4659   } else if (L_slow_path == NULL) {
4660     L_slow_path = &L_fallthrough;
4661   }
4662 
4663   // Fast path check: class is fully initialized
4664   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
4665   jcc(Assembler::equal, *L_fast_path);
4666 
4667   // Fast path check: current thread is initializer thread
4668   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
4669   if (L_slow_path == &L_fallthrough) {
4670     jcc(Assembler::equal, *L_fast_path);
4671     bind(*L_slow_path);
4672   } else if (L_fast_path == &L_fallthrough) {
4673     jcc(Assembler::notEqual, *L_slow_path);
4674     bind(*L_fast_path);
4675   } else {
4676     Unimplemented();
4677   }
4678 }
4679 
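// Conditional 32-bit move, emulated with a short branch around a plain move on CPUs
// that lack CMOV support.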
4680 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
4681   if (VM_Version::supports_cmov()) {
4682     cmovl(cc, dst, src);
4683   } else {
4684     Label L;
4685     jccb(negate_condition(cc), L);
4686     movl(dst, src);
4687     bind(L);
4688   }
4689 }
4690 
4691 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
4692   if (VM_Version::supports_cmov()) {
4693     cmovl(cc, dst, src);
4694   } else {
4695     Label L;
4696     jccb(negate_condition(cc), L);
4697     movl(dst, src);
4698     bind(L);
4699   }
4700 }
4701 
4702 void MacroAssembler::verify_oop(Register reg, const char* s) {
4703   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string below confuses VerifyAdapterSharing
4705     // because it may differ between otherwise equivalent adapters.
4706     return;
4707   }
4708 
4709   // Pass register number to verify_oop_subroutine
4710   const char* b = NULL;
4711   {
4712     ResourceMark rm;
4713     stringStream ss;
4714     ss.print("verify_oop: %s: %s", reg->name(), s);
4715     b = code_string(ss.as_string());
4716   }
4717   BLOCK_COMMENT("verify_oop {");
4718 #ifdef _LP64
4719   push(rscratch1);                    // save r10, trashed by movptr()
4720 #endif
4721   push(rax);                          // save rax,
4722   push(reg);                          // pass register argument
4723   ExternalAddress buffer((address) b);
4724   // avoid using pushptr, as it modifies scratch registers
4725   // and our contract is not to modify anything
4726   movptr(rax, buffer.addr());
4727   push(rax);
4728   // call indirectly to solve generation ordering problem
4729   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4730   call(rax);
4731   // Caller pops the arguments (oop, message) and restores rax, r10
4732   BLOCK_COMMENT("} verify_oop");
4733 }
4734 
4735 
4736 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
4737                                                       Register tmp,
4738                                                       int offset) {
4739   intptr_t value = *delayed_value_addr;
4740   if (value != 0)
4741     return RegisterOrConstant(value + offset);
4742 
4743   // load indirectly to solve generation ordering problem
4744   movptr(tmp, ExternalAddress((address) delayed_value_addr));
4745 
4746 #ifdef ASSERT
4747   { Label L;
4748     testptr(tmp, tmp);
4749     if (WizardMode) {
4750       const char* buf = NULL;
4751       {
4752         ResourceMark rm;
4753         stringStream ss;
4754         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
4755         buf = code_string(ss.as_string());
4756       }
4757       jcc(Assembler::notZero, L);
4758       STOP(buf);
4759     } else {
4760       jccb(Assembler::notZero, L);
4761       hlt();
4762     }
4763     bind(L);
4764   }
4765 #endif
4766 
4767   if (offset != 0)
4768     addptr(tmp, offset);
4769 
4770   return RegisterOrConstant(tmp);
4771 }
4772 
4773 
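// Computes the address of interpreter argument slot 'arg_slot' (plus extra_slot_offset
// stack slots), relative to rsp with the return PC already on the stack.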
4774 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
4775                                          int extra_slot_offset) {
4776   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
4777   int stackElementSize = Interpreter::stackElementSize;
4778   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
4779 #ifdef ASSERT
4780   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
4781   assert(offset1 - offset == stackElementSize, "correct arithmetic");
4782 #endif
4783   Register             scale_reg    = noreg;
4784   Address::ScaleFactor scale_factor = Address::no_scale;
4785   if (arg_slot.is_constant()) {
4786     offset += arg_slot.as_constant() * stackElementSize;
4787   } else {
4788     scale_reg    = arg_slot.as_register();
4789     scale_factor = Address::times(stackElementSize);
4790   }
4791   offset += wordSize;           // return PC is on stack
4792   return Address(rsp, scale_reg, scale_factor, offset);
4793 }
4794 
4795 
4796 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
4797   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string below confuses VerifyAdapterSharing
4799     // because it may differ between otherwise equivalent adapters.
4800     return;
4801   }
4802 
4803   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4804   // Pass register number to verify_oop_subroutine
4805   const char* b = NULL;
4806   {
4807     ResourceMark rm;
4808     stringStream ss;
4809     ss.print("verify_oop_addr: %s", s);
4810     b = code_string(ss.as_string());
4811   }
4812 #ifdef _LP64
4813   push(rscratch1);                    // save r10, trashed by movptr()
4814 #endif
4815   push(rax);                          // save rax,
4816   // addr may contain rsp so we will have to adjust it based on the push
4817   // we just did (and on 64 bit we do two pushes)
  // NOTE: the 64-bit code seemed to have had a bug in that it did movq(addr, rax),
  // which stores rax into addr, the reverse of what was intended.
4820   if (addr.uses(rsp)) {
4821     lea(rax, addr);
4822     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4823   } else {
4824     pushptr(addr);
4825   }
4826 
4827   ExternalAddress buffer((address) b);
4828   // pass msg argument
4829   // avoid using pushptr, as it modifies scratch registers
4830   // and our contract is not to modify anything
4831   movptr(rax, buffer.addr());
4832   push(rax);
4833 
4834   // call indirectly to solve generation ordering problem
4835   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4836   call(rax);
4837   // Caller pops the arguments (addr, message) and restores rax, r10.
4838 }
4839 
4840 void MacroAssembler::verify_tlab() {
4841 #ifdef ASSERT
4842   if (UseTLAB && VerifyOops) {
4843     Label next, ok;
4844     Register t1 = rsi;
4845     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4846 
4847     push(t1);
4848     NOT_LP64(push(thread_reg));
4849     NOT_LP64(get_thread(thread_reg));
4850 
4851     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4852     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4853     jcc(Assembler::aboveEqual, next);
4854     STOP("assert(top >= start)");
4855     should_not_reach_here();
4856 
4857     bind(next);
4858     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4859     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4860     jcc(Assembler::aboveEqual, ok);
4861     STOP("assert(top <= end)");
4862     should_not_reach_here();
4863 
4864     bind(ok);
4865     NOT_LP64(pop(thread_reg));
4866     pop(t1);
4867   }
4868 #endif
4869 }
4870 
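// Helper classes for decoding the CPU state captured by push_CPU_state(); they are used
// only by the debugging helpers print_CPU_state() and verify_FPU() below.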
4871 class ControlWord {
4872  public:
4873   int32_t _value;
4874 
4875   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4876   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4877   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4878   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4879   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4880   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4881   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4882   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4883 
4884   void print() const {
4885     // rounding control
4886     const char* rc;
4887     switch (rounding_control()) {
4888       case 0: rc = "round near"; break;
4889       case 1: rc = "round down"; break;
4890       case 2: rc = "round up  "; break;
4891       case 3: rc = "chop      "; break;
4892     };
4893     // precision control
4894     const char* pc;
4895     switch (precision_control()) {
4896       case 0: pc = "24 bits "; break;
4897       case 1: pc = "reserved"; break;
4898       case 2: pc = "53 bits "; break;
4899       case 3: pc = "64 bits "; break;
4900     };
4901     // flags
4902     char f[9];
4903     f[0] = ' ';
4904     f[1] = ' ';
4905     f[2] = (precision   ()) ? 'P' : 'p';
4906     f[3] = (underflow   ()) ? 'U' : 'u';
4907     f[4] = (overflow    ()) ? 'O' : 'o';
4908     f[5] = (zero_divide ()) ? 'Z' : 'z';
4909     f[6] = (denormalized()) ? 'D' : 'd';
4910     f[7] = (invalid     ()) ? 'I' : 'i';
4911     f[8] = '\x0';
4912     // output
4913     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4914   }
4915 
4916 };
4917 
4918 class StatusWord {
4919  public:
4920   int32_t _value;
4921 
4922   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4923   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4924   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4925   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4926   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4927   int  top() const                     { return  (_value >> 11) & 7      ; }
4928   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4929   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4930   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4931   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4932   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4933   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4934   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4935   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4936 
4937   void print() const {
4938     // condition codes
4939     char c[5];
4940     c[0] = (C3()) ? '3' : '-';
4941     c[1] = (C2()) ? '2' : '-';
4942     c[2] = (C1()) ? '1' : '-';
4943     c[3] = (C0()) ? '0' : '-';
4944     c[4] = '\x0';
4945     // flags
4946     char f[9];
4947     f[0] = (error_status()) ? 'E' : '-';
4948     f[1] = (stack_fault ()) ? 'S' : '-';
4949     f[2] = (precision   ()) ? 'P' : '-';
4950     f[3] = (underflow   ()) ? 'U' : '-';
4951     f[4] = (overflow    ()) ? 'O' : '-';
4952     f[5] = (zero_divide ()) ? 'Z' : '-';
4953     f[6] = (denormalized()) ? 'D' : '-';
4954     f[7] = (invalid     ()) ? 'I' : '-';
4955     f[8] = '\x0';
4956     // output
4957     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4958   }
4959 
4960 };
4961 
4962 class TagWord {
4963  public:
4964   int32_t _value;
4965 
4966   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4967 
4968   void print() const {
4969     printf("%04x", _value & 0xFFFF);
4970   }
4971 
4972 };
4973 
4974 class FPU_Register {
4975  public:
4976   int32_t _m0;
4977   int32_t _m1;
4978   int16_t _ex;
4979 
4980   bool is_indefinite() const           {
4981     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4982   }
4983 
4984   void print() const {
4985     char  sign = (_ex < 0) ? '-' : '+';
4986     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4987     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4988   };
4989 
4990 };
4991 
4992 class FPU_State {
4993  public:
4994   enum {
4995     register_size       = 10,
4996     number_of_registers =  8,
4997     register_mask       =  7
4998   };
4999 
5000   ControlWord  _control_word;
5001   StatusWord   _status_word;
5002   TagWord      _tag_word;
5003   int32_t      _error_offset;
5004   int32_t      _error_selector;
5005   int32_t      _data_offset;
5006   int32_t      _data_selector;
5007   int8_t       _register[register_size * number_of_registers];
5008 
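  // Tags are stored per physical register; rotate by the current top-of-stack index
  // to obtain the tag of logical stack slot ST(i).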
5009   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
5010   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
5011 
5012   const char* tag_as_string(int tag) const {
5013     switch (tag) {
5014       case 0: return "valid";
5015       case 1: return "zero";
5016       case 2: return "special";
5017       case 3: return "empty";
5018     }
5019     ShouldNotReachHere();
5020     return NULL;
5021   }
5022 
5023   void print() const {
5024     // print computation registers
5025     { int t = _status_word.top();
5026       for (int i = 0; i < number_of_registers; i++) {
5027         int j = (i - t) & register_mask;
5028         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
5029         st(j)->print();
5030         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
5031       }
5032     }
5033     printf("\n");
5034     // print control registers
5035     printf("ctrl = "); _control_word.print(); printf("\n");
5036     printf("stat = "); _status_word .print(); printf("\n");
5037     printf("tags = "); _tag_word    .print(); printf("\n");
5038   }
5039 
5040 };
5041 
5042 class Flag_Register {
5043  public:
5044   int32_t _value;
5045 
5046   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
5047   bool direction() const               { return ((_value >> 10) & 1) != 0; }
5048   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
5049   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
5050   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
5051   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
5052   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
5053 
5054   void print() const {
5055     // flags
5056     char f[8];
5057     f[0] = (overflow       ()) ? 'O' : '-';
5058     f[1] = (direction      ()) ? 'D' : '-';
5059     f[2] = (sign           ()) ? 'S' : '-';
5060     f[3] = (zero           ()) ? 'Z' : '-';
5061     f[4] = (auxiliary_carry()) ? 'A' : '-';
5062     f[5] = (parity         ()) ? 'P' : '-';
5063     f[6] = (carry          ()) ? 'C' : '-';
5064     f[7] = '\x0';
5065     // output
5066     printf("%08x  flags = %s", _value, f);
5067   }
5068 
5069 };
5070 
5071 class IU_Register {
5072  public:
5073   int32_t _value;
5074 
5075   void print() const {
5076     printf("%08x  %11d", _value, _value);
5077   }
5078 
5079 };
5080 
5081 class IU_State {
5082  public:
5083   Flag_Register _eflags;
5084   IU_Register   _rdi;
5085   IU_Register   _rsi;
5086   IU_Register   _rbp;
5087   IU_Register   _rsp;
5088   IU_Register   _rbx;
5089   IU_Register   _rdx;
5090   IU_Register   _rcx;
5091   IU_Register   _rax;
5092 
5093   void print() const {
5094     // computation registers
5095     printf("rax,  = "); _rax.print(); printf("\n");
5096     printf("rbx,  = "); _rbx.print(); printf("\n");
5097     printf("rcx  = "); _rcx.print(); printf("\n");
5098     printf("rdx  = "); _rdx.print(); printf("\n");
5099     printf("rdi  = "); _rdi.print(); printf("\n");
5100     printf("rsi  = "); _rsi.print(); printf("\n");
5101     printf("rbp,  = "); _rbp.print(); printf("\n");
5102     printf("rsp  = "); _rsp.print(); printf("\n");
5103     printf("\n");
5104     // control registers
5105     printf("flgs = "); _eflags.print(); printf("\n");
5106   }
5107 };
5108 
5109 
5110 class CPU_State {
5111  public:
5112   FPU_State _fpu_state;
5113   IU_State  _iu_state;
5114 
5115   void print() const {
5116     printf("--------------------------------------------------\n");
5117     _iu_state .print();
5118     printf("\n");
5119     _fpu_state.print();
5120     printf("--------------------------------------------------\n");
5121   }
5122 
5123 };
5124 
5125 
5126 static void _print_CPU_state(CPU_State* state) {
5127   state->print();
5128 };
5129 
5130 
5131 void MacroAssembler::print_CPU_state() {
5132   push_CPU_state();
5133   push(rsp);                // pass CPU state
5134   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
5135   addptr(rsp, wordSize);       // discard argument
5136   pop_CPU_state();
5137 }
5138 
5139 
5140 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5141   static int counter = 0;
5142   FPU_State* fs = &state->_fpu_state;
5143   counter++;
5144   // For leaf calls, only verify that the top few elements remain empty.
5145   // We only need 1 empty at the top for C2 code.
5146   if( stack_depth < 0 ) {
5147     if( fs->tag_for_st(7) != 3 ) {
5148       printf("FPR7 not empty\n");
5149       state->print();
5150       assert(false, "error");
5151       return false;
5152     }
5153     return true;                // All other stack states do not matter
5154   }
5155 
5156   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5157          "bad FPU control word");
5158 
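  // Tag values (see FPU_State::tag_as_string): 0 = valid, 1 = zero, 2 = special, 3 = empty.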
5159   // compute stack depth
5160   int i = 0;
5161   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
5162   int d = i;
5163   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5164   // verify findings
5165   if (i != FPU_State::number_of_registers) {
5166     // stack not contiguous
5167     printf("%s: stack not contiguous at ST%d\n", s, i);
5168     state->print();
5169     assert(false, "error");
5170     return false;
5171   }
5172   // check if computed stack depth corresponds to expected stack depth
5173   if (stack_depth < 0) {
5174     // expected stack depth is -stack_depth or less
5175     if (d > -stack_depth) {
5176       // too many elements on the stack
5177       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5178       state->print();
5179       assert(false, "error");
5180       return false;
5181     }
5182   } else {
5183     // expected stack depth is stack_depth
5184     if (d != stack_depth) {
5185       // wrong stack depth
5186       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5187       state->print();
5188       assert(false, "error");
5189       return false;
5190     }
5191   }
5192   // everything is cool
5193   return true;
5194 }
5195 
5196 
5197 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
5198   if (!VerifyFPU) return;
5199   push_CPU_state();
5200   push(rsp);                // pass CPU state
5201   ExternalAddress msg((address) s);
5202   // pass message string s
5203   pushptr(msg.addr());
5204   push(stack_depth);        // pass stack depth
5205   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
5206   addptr(rsp, 3 * wordSize);   // discard arguments
5207   // check for error
5208   { Label L;
5209     testl(rax, rax);
5210     jcc(Assembler::notZero, L);
5211     int3();                  // break if error condition
5212     bind(L);
5213   }
5214   pop_CPU_state();
5215 }
5216 
5217 void MacroAssembler::restore_cpu_control_state_after_jni() {
5218   // Either restore the MXCSR register after returning from the JNI Call
5219   // or verify that it wasn't changed (with -Xcheck:jni flag).
5220   if (VM_Version::supports_sse()) {
5221     if (RestoreMXCSROnJNICalls) {
5222       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
5223     } else if (CheckJNICalls) {
5224       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
5225     }
5226   }
5227   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
5228   vzeroupper();

#ifdef COMPILER2
  // Reset k1 to 0xffff.
  if (PostLoopMultiversioning && VM_Version::supports_evex()) {
5233     push(rcx);
5234     movl(rcx, 0xffff);
5235     kmovwl(k1, rcx);
5236     pop(rcx);
5237   }
5238 #endif // COMPILER2
5239 
5240 #ifndef _LP64
5241   // Either restore the x87 floating pointer control word after returning
5242   // from the JNI call or verify that it wasn't changed.
5243   if (CheckJNICalls) {
5244     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
5245   }
5246 #endif // _LP64
5247 }
5248 
5249 // ((OopHandle)result).resolve();
5250 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
5251   assert_different_registers(result, tmp);
5252 
5253   // Only 64 bit platforms support GCs that require a tmp register
5254   // Only IN_HEAP loads require a thread_tmp register
5255   // OopHandle::resolve is an indirection like jobject.
5256   access_load_at(T_OBJECT, IN_NATIVE,
5257                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
5258 }
5259 
5260 // ((WeakHandle)result).resolve();
5261 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
5262   assert_different_registers(rresult, rtmp);
5263   Label resolved;
5264 
5265   // A null weak handle resolves to null.
5266   cmpptr(rresult, 0);
5267   jcc(Assembler::equal, resolved);
5268 
5269   // Only 64 bit platforms support GCs that require a tmp register
5270   // Only IN_HEAP loads require a thread_tmp register
5271   // WeakHandle::resolve is an indirection like jweak.
5272   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
5273                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
5274   bind(resolved);
5275 }
5276 
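// Loads the java.lang.Class mirror of the holder klass of 'method' into 'mirror' and
// resolves the mirror's OopHandle indirection.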
5277 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
5278   // get mirror
5279   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
5280   load_method_holder(mirror, method);
5281   movptr(mirror, Address(mirror, mirror_offset));
5282   resolve_oop_handle(mirror, tmp);
5283 }
5284 
5285 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
5286   load_method_holder(rresult, rmethod);
5287   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
5288 }
5289 
5290 void MacroAssembler::load_metadata(Register dst, Register src) {
5291   if (UseCompressedClassPointers) {
5292     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5293   } else {
5294     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5295   }
5296 }
5297 
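// Loads only the storage-property bits of the klass word of 'src' into 'dst'; these
// occupy the high bits above the (narrow or wide) klass pointer.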
5298 void MacroAssembler::load_storage_props(Register dst, Register src) {
5299   load_metadata(dst, src);
5300   if (UseCompressedClassPointers) {
5301     shrl(dst, oopDesc::narrow_storage_props_shift);
5302   } else {
5303     shrq(dst, oopDesc::wide_storage_props_shift);
5304   }
5305 }
5306 
5307 void MacroAssembler::load_method_holder(Register holder, Register method) {
5308   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
5309   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
5310   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
5311 }
5312 
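// Loads the Klass* of 'src' into 'dst'. The storage-property bits that share the klass
// word are stripped, and the narrow klass is decoded when compressed class pointers
// are in use.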
5313 void MacroAssembler::load_klass(Register dst, Register src) {
5314   load_metadata(dst, src);
5315 #ifdef _LP64
5316   if (UseCompressedClassPointers) {
5317     andl(dst, oopDesc::compressed_klass_mask());
5318     decode_klass_not_null(dst);
5319   } else
5320 #endif
5321   {
5322 #ifdef _LP64
5323     shlq(dst, oopDesc::storage_props_nof_bits);
5324     shrq(dst, oopDesc::storage_props_nof_bits);
5325 #else
5326     andl(dst, oopDesc::wide_klass_mask());
5327 #endif
5328   }
5329 }
5330 
5331 void MacroAssembler::load_prototype_header(Register dst, Register src) {
5332   load_klass(dst, src);
5333   movptr(dst, Address(dst, Klass::prototype_header_offset()));
5334 }
5335 
5336 void MacroAssembler::store_klass(Register dst, Register src) {
5337 #ifdef _LP64
5338   if (UseCompressedClassPointers) {
5339     encode_klass_not_null(src);
5340     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5341   } else
5342 #endif
5343     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
5344 }
5345 
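// Dispatches a decorated load through the current GC's BarrierSetAssembler; AS_RAW
// forces the plain base-class implementation, bypassing GC-specific barrier code.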
5346 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
5347                                     Register tmp1, Register thread_tmp) {
5348   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5349   decorators = AccessInternal::decorator_fixup(decorators);
5350   bool as_raw = (decorators & AS_RAW) != 0;
5351   if (as_raw) {
5352     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5353   } else {
5354     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
5355   }
5356 }
5357 
5358 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
5359                                      Register tmp1, Register tmp2, Register tmp3) {
5360   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5361   decorators = AccessInternal::decorator_fixup(decorators);
5362   bool as_raw = (decorators & AS_RAW) != 0;
5363   if (as_raw) {
5364     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
5365   } else {
5366     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2, tmp3);
5367   }
5368 }
5369 
5370 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
5371   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
5372   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
5373     decorators |= ACCESS_READ | ACCESS_WRITE;
5374   }
5375   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5376   return bs->resolve(this, decorators, obj);
5377 }
5378 
5379 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
5380                                    Register thread_tmp, DecoratorSet decorators) {
5381   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
5382 }
5383 
// Doesn't do verification, generates fixed-size code
5385 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
5386                                             Register thread_tmp, DecoratorSet decorators) {
5387   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
5388 }
5389 
5390 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
5391                                     Register tmp2, Register tmp3, DecoratorSet decorators) {
5392   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2, tmp3);
5393 }
5394 
5395 // Used for storing NULLs.
5396 void MacroAssembler::store_heap_oop_null(Address dst) {
5397   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
5398 }
5399 
5400 #ifdef _LP64
5401 void MacroAssembler::store_klass_gap(Register dst, Register src) {
5402   if (UseCompressedClassPointers) {
5403     // Store to klass gap in destination
5404     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
5405   }
5406 }
5407 
5408 #ifdef ASSERT
5409 void MacroAssembler::verify_heapbase(const char* msg) {
5410   assert (UseCompressedOops, "should be compressed");
5411   assert (Universe::heap() != NULL, "java heap should be initialized");
5412   if (CheckCompressedOops) {
5413     Label ok;
5414     push(rscratch1); // cmpptr trashes rscratch1
5415     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5416     jcc(Assembler::equal, ok);
5417     STOP(msg);
5418     bind(ok);
5419     pop(rscratch1);
5420   }
5421 }
5422 #endif
5423 
5424 // Algorithm must match oop.inline.hpp encode_heap_oop.
5425 void MacroAssembler::encode_heap_oop(Register r) {
5426 #ifdef ASSERT
5427   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
5428 #endif
5429   verify_oop(r, "broken oop in encode_heap_oop");
5430   if (CompressedOops::base() == NULL) {
5431     if (CompressedOops::shift() != 0) {
5432       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5433       shrq(r, LogMinObjAlignmentInBytes);
5434     }
5435     return;
5436   }
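  // A NULL oop must encode to narrow 0: substitute the heap base for NULL so the
  // subtraction below yields 0, which the shift then leaves at 0.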
5437   testq(r, r);
5438   cmovq(Assembler::equal, r, r12_heapbase);
5439   subq(r, r12_heapbase);
5440   shrq(r, LogMinObjAlignmentInBytes);
5441 }
5442 
5443 void MacroAssembler::encode_heap_oop_not_null(Register r) {
5444 #ifdef ASSERT
5445   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
5446   if (CheckCompressedOops) {
5447     Label ok;
5448     testq(r, r);
5449     jcc(Assembler::notEqual, ok);
5450     STOP("null oop passed to encode_heap_oop_not_null");
5451     bind(ok);
5452   }
5453 #endif
5454   verify_oop(r, "broken oop in encode_heap_oop_not_null");
5455   if (CompressedOops::base() != NULL) {
5456     subq(r, r12_heapbase);
5457   }
5458   if (CompressedOops::shift() != 0) {
5459     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5460     shrq(r, LogMinObjAlignmentInBytes);
5461   }
5462 }
5463 
5464 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
5465 #ifdef ASSERT
5466   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
5467   if (CheckCompressedOops) {
5468     Label ok;
5469     testq(src, src);
5470     jcc(Assembler::notEqual, ok);
5471     STOP("null oop passed to encode_heap_oop_not_null2");
5472     bind(ok);
5473   }
5474 #endif
5475   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
5476   if (dst != src) {
5477     movq(dst, src);
5478   }
5479   if (CompressedOops::base() != NULL) {
5480     subq(dst, r12_heapbase);
5481   }
5482   if (CompressedOops::shift() != 0) {
5483     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5484     shrq(dst, LogMinObjAlignmentInBytes);
5485   }
5486 }
5487 
5488 void  MacroAssembler::decode_heap_oop(Register r) {
5489 #ifdef ASSERT
5490   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
5491 #endif
5492   if (CompressedOops::base() == NULL) {
5493     if (CompressedOops::shift() != 0) {
5494       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5495       shlq(r, LogMinObjAlignmentInBytes);
5496     }
5497   } else {
5498     Label done;
5499     shlq(r, LogMinObjAlignmentInBytes);
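    // shlq sets ZF when the narrow oop is 0 (NULL); in that case skip adding the
    // heap base so that NULL decodes back to NULL.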
5500     jccb(Assembler::equal, done);
5501     addq(r, r12_heapbase);
5502     bind(done);
5503   }
5504   verify_oop(r, "broken oop in decode_heap_oop");
5505 }
5506 
5507 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
5508   // Note: it will change flags
5509   assert (UseCompressedOops, "should only be used for compressed headers");
5510   assert (Universe::heap() != NULL, "java heap should be initialized");
5511   // Cannot assert, unverified entry point counts instructions (see .ad file)
5512   // vtableStubs also counts instructions in pd_code_size_limit.
5513   // Also do not verify_oop as this is called by verify_oop.
5514   if (CompressedOops::shift() != 0) {
5515     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5516     shlq(r, LogMinObjAlignmentInBytes);
5517     if (CompressedOops::base() != NULL) {
5518       addq(r, r12_heapbase);
5519     }
5520   } else {
5521     assert (CompressedOops::base() == NULL, "sanity");
5522   }
5523 }
5524 
5525 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
5526   // Note: it will change flags
5527   assert (UseCompressedOops, "should only be used for compressed headers");
5528   assert (Universe::heap() != NULL, "java heap should be initialized");
5529   // Cannot assert, unverified entry point counts instructions (see .ad file)
5530   // vtableStubs also counts instructions in pd_code_size_limit.
5531   // Also do not verify_oop as this is called by verify_oop.
5532   if (CompressedOops::shift() != 0) {
5533     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
5534     if (LogMinObjAlignmentInBytes == Address::times_8) {
5535       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
5536     } else {
5537       if (dst != src) {
5538         movq(dst, src);
5539       }
5540       shlq(dst, LogMinObjAlignmentInBytes);
5541       if (CompressedOops::base() != NULL) {
5542         addq(dst, r12_heapbase);
5543       }
5544     }
5545   } else {
5546     assert (CompressedOops::base() == NULL, "sanity");
5547     if (dst != src) {
5548       movq(dst, src);
5549     }
5550   }
5551 }
5552 
5553 void MacroAssembler::encode_klass_not_null(Register r) {
5554   if (CompressedKlassPointers::base() != NULL) {
5555     // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5556     assert(r != r12_heapbase, "Encoding a klass in r12");
5557     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5558     subq(r, r12_heapbase);
5559   }
5560   if (CompressedKlassPointers::shift() != 0) {
5561     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5562     shrq(r, LogKlassAlignmentInBytes);
5563   }
5564   if (CompressedKlassPointers::base() != NULL) {
5565     reinit_heapbase();
5566   }
5567 }
5568 
5569 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
5570   if (dst == src) {
5571     encode_klass_not_null(src);
5572   } else {
5573     if (CompressedKlassPointers::base() != NULL) {
5574       mov64(dst, (int64_t)CompressedKlassPointers::base());
5575       negq(dst);
5576       addq(dst, src);
5577     } else {
5578       movptr(dst, src);
5579     }
5580     if (CompressedKlassPointers::shift() != 0) {
5581       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5582       shrq(dst, LogKlassAlignmentInBytes);
5583     }
5584   }
5585 }
5586 
5587 // Function instr_size_for_decode_klass_not_null() counts the instructions
5588 // generated by decode_klass_not_null(register r) and reinit_heapbase(),
5589 // when (Universe::heap() != NULL).  Hence, if the instructions they
5590 // generate change, then this method needs to be updated.
5591 int MacroAssembler::instr_size_for_decode_klass_not_null() {
5592   assert (UseCompressedClassPointers, "only for compressed klass ptrs");
5593   if (CompressedKlassPointers::base() != NULL) {
5594     // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
5595     return (CompressedKlassPointers::shift() == 0 ? 20 : 24);
5596   } else {
5597     // longest load decode klass function, mov64, leaq
5598     return 16;
5599   }
5600 }
5601 
5602 // !!! If the instructions that get generated here change then function
5603 // instr_size_for_decode_klass_not_null() needs to get updated.
5604 void  MacroAssembler::decode_klass_not_null(Register r) {
5605   // Note: it will change flags
5606   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5607   assert(r != r12_heapbase, "Decoding a klass in r12");
5608   // Cannot assert, unverified entry point counts instructions (see .ad file)
5609   // vtableStubs also counts instructions in pd_code_size_limit.
5610   // Also do not verify_oop as this is called by verify_oop.
5611   if (CompressedKlassPointers::shift() != 0) {
5612     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5613     shlq(r, LogKlassAlignmentInBytes);
5614   }
5615   // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
5616   if (CompressedKlassPointers::base() != NULL) {
5617     mov64(r12_heapbase, (int64_t)CompressedKlassPointers::base());
5618     addq(r, r12_heapbase);
5619     reinit_heapbase();
5620   }
5621 }
5622 
5623 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
5624   // Note: it will change flags
5625   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5626   if (dst == src) {
5627     decode_klass_not_null(dst);
5628   } else {
5629     // Cannot assert, unverified entry point counts instructions (see .ad file)
5630     // vtableStubs also counts instructions in pd_code_size_limit.
5631     // Also do not verify_oop as this is called by verify_oop.
5632     mov64(dst, (int64_t)CompressedKlassPointers::base());
5633     if (CompressedKlassPointers::shift() != 0) {
5634       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
5635       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
5636       leaq(dst, Address(dst, src, Address::times_8, 0));
5637     } else {
5638       addq(dst, src);
5639     }
5640   }
5641 }
5642 
5643 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5644   assert (UseCompressedOops, "should only be used for compressed headers");
5645   assert (Universe::heap() != NULL, "java heap should be initialized");
5646   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5647   int oop_index = oop_recorder()->find_index(obj);
5648   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5649   mov_narrow_oop(dst, oop_index, rspec);
5650 }
5651 
5652 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
5653   assert (UseCompressedOops, "should only be used for compressed headers");
5654   assert (Universe::heap() != NULL, "java heap should be initialized");
5655   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5656   int oop_index = oop_recorder()->find_index(obj);
5657   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5658   mov_narrow_oop(dst, oop_index, rspec);
5659 }
5660 
5661 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5662   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5663   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5664   int klass_index = oop_recorder()->find_index(k);
5665   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5666   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5667 }
5668 
5669 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
5670   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5671   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5672   int klass_index = oop_recorder()->find_index(k);
5673   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5674   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5675 }
5676 
5677 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
5678   assert (UseCompressedOops, "should only be used for compressed headers");
5679   assert (Universe::heap() != NULL, "java heap should be initialized");
5680   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5681   int oop_index = oop_recorder()->find_index(obj);
5682   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5683   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5684 }
5685 
5686 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
5687   assert (UseCompressedOops, "should only be used for compressed headers");
5688   assert (Universe::heap() != NULL, "java heap should be initialized");
5689   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5690   int oop_index = oop_recorder()->find_index(obj);
5691   RelocationHolder rspec = oop_Relocation::spec(oop_index);
5692   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
5693 }
5694 
5695 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
5696   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5697   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5698   int klass_index = oop_recorder()->find_index(k);
5699   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5700   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5701 }
5702 
5703 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
5704   assert (UseCompressedClassPointers, "should only be used for compressed headers");
5705   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
5706   int klass_index = oop_recorder()->find_index(k);
5707   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
5708   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
5709 }
5710 
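// Reloads r12_heapbase with the compressed oops base (zeroing it for zero-based
// compressed oops); used after r12 has served as a scratch register, e.g. in the
// klass encode/decode routines above.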
5711 void MacroAssembler::reinit_heapbase() {
5712   if (UseCompressedOops || UseCompressedClassPointers) {
5713     if (Universe::heap() != NULL) {
5714       if (CompressedOops::base() == NULL) {
5715         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
5716       } else {
5717         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
5718       }
5719     } else {
5720       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
5721     }
5722   }
5723 }
5724 
5725 #endif // _LP64
5726 
5727 // C2 compiled method's prolog code.
5728 void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
5729   int framesize = C->frame_size_in_bytes();
5730   int bangsize = C->bang_size_in_bytes();
5731   bool fp_mode_24b = C->in_24_bit_fp_mode();
5732   int stack_bang_size = C->need_stack_bang(bangsize) ? bangsize : 0;
5733   bool is_stub = C->stub_function() != NULL;
5734 
5735   // WARNING: Initial instruction MUST be 5 bytes or longer so that
5736   // NativeJump::patch_verified_entry will be able to patch out the entry
5737   // code safely. The push to verify stack depth is ok at 5 bytes,
5738   // the frame allocation can be either 3 or 6 bytes. So if we don't do
5739   // stack bang then we must use the 6 byte frame allocation even if
5740   // we have no frame. :-(
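     // (On x86 a jmp rel32 occupies five bytes, which is presumably what
     //  patch_verified_entry writes, hence the five-byte minimum above.)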
5741   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
5742 
5743   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
5744   // Remove word for return addr
5745   framesize -= wordSize;
5746   stack_bang_size -= wordSize;
5747 
5748   // Calls to C2R adapters often do not accept exceptional returns.
5749   // We require that their callers bang the stack for them.  But be careful,
5750   // because some VM calls (such as call site linkage) can use several kilobytes
5751   // of stack; the stack safety zone should account for that.
5752   // See bugs 4446381, 4468289, 4497237.
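     // Two frame-setup shapes follow: with a stack bang, the bang code emitted first
     // satisfies the size requirement and rbp is pushed before carving out the frame;
     // without one, a single subptr with a forced 32-bit immediate (>= 5 bytes)
     // creates the frame and rbp is stored into it rather than pushed.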
5753   if (stack_bang_size > 0) {
5754     generate_stack_overflow_check(stack_bang_size);
5755 
5756     // We always push rbp so that, on return to the interpreter, rbp will be
5757     // restored correctly and we can correct the stack.
5758     push(rbp);
5759     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5760     if (PreserveFramePointer) {
5761       mov(rbp, rsp);
5762     }
5763     // Remove word for rbp
5764     framesize -= wordSize;
5765 
5766     // Create frame
5767     if (framesize) {
5768       subptr(rsp, framesize);
5769     }
5770   } else {
5771     // Create frame (force generation of a 4 byte immediate value)
5772     subptr_imm32(rsp, framesize);
5773 
5774     // Save RBP register now.
5775     framesize -= wordSize;
5776     movptr(Address(rsp, framesize), rbp);
5777     // Save caller's stack pointer into RBP if the frame pointer is preserved.
5778     if (PreserveFramePointer) {
5779       movptr(rbp, rsp);
5780       if (framesize > 0) {
5781         addptr(rbp, framesize);
5782       }
5783     }
5784   }
5785 
5786   if (C->needs_stack_repair()) {
5787     // Save stack increment (also account for fixed framesize and rbp)
5788     assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
5789     movptr(Address(rsp, C->sp_inc_offset()), sp_inc + framesize + wordSize);
5790   }
5791 
5792   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
5793     framesize -= wordSize;
5794     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
5795   }
5796 
5797 #ifndef _LP64
5798   // If method sets FPU control word do it now
5799   if (fp_mode_24b) {
5800     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
5801   }
5802   if (UseSSE >= 2 && VerifyFPU) {
5803     verify_FPU(0, "FPU stack must be clean on entry");
5804   }
5805 #endif
5806 
5807 #ifdef ASSERT
5808   if (VerifyStackAtCalls) {
5809     Label L;
5810     push(rax);
5811     mov(rax, rsp);
5812     andptr(rax, StackAlignmentInBytes-1);
5813     cmpptr(rax, StackAlignmentInBytes-wordSize);
5814     pop(rax);
5815     jcc(Assembler::equal, L);
5816     STOP("Stack is not properly aligned!");
5817     bind(L);
5818   }
5819 #endif
5820 
5821   if (!is_stub) {
5822     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
5823     bs->nmethod_entry_barrier(this);
5824   }
5825 }
5826 
5827 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
5828 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp) {
5829   // cnt - number of qwords (8-byte words).
5830   // base - start address, qword aligned.
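     // Strategy: broadcast 'val' across xtmp, then clear 64 bytes per iteration
     // (two 32-byte YMM stores with AVX2, otherwise four 16-byte XMM stores),
     // handle a 32-byte step for medium tails, and finish with a qword loop.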
5831   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
5832   movdq(xtmp, val);
5833   if (UseAVX >= 2) {
5834     punpcklqdq(xtmp, xtmp);
5835     vinserti128_high(xtmp, xtmp);
5836   } else {
5837     punpcklqdq(xtmp, xtmp);
5838   }
5839   jmp(L_zero_64_bytes);
5840 
5841   BIND(L_loop);
5842   if (UseAVX >= 2) {
5843     vmovdqu(Address(base,  0), xtmp);
5844     vmovdqu(Address(base, 32), xtmp);
5845   } else {
5846     movdqu(Address(base,  0), xtmp);
5847     movdqu(Address(base, 16), xtmp);
5848     movdqu(Address(base, 32), xtmp);
5849     movdqu(Address(base, 48), xtmp);
5850   }
5851   addptr(base, 64);
5852 
5853   BIND(L_zero_64_bytes);
5854   subptr(cnt, 8);
5855   jccb(Assembler::greaterEqual, L_loop);
5856   addptr(cnt, 4);
5857   jccb(Assembler::less, L_tail);
5858   // Clear trailing 32 bytes
5859   if (UseAVX >= 2) {
5860     vmovdqu(Address(base, 0), xtmp);
5861   } else {
5862     movdqu(Address(base,  0), xtmp);
5863     movdqu(Address(base, 16), xtmp);
5864   }
5865   addptr(base, 32);
5866   subptr(cnt, 4);
5867 
5868   BIND(L_tail);
5869   addptr(cnt, 4);
5870   jccb(Assembler::lessEqual, L_end);
5871   decrement(cnt);
5872 
5873   BIND(L_sloop);
5874   movq(Address(base, 0), xtmp);
5875   addptr(base, 8);
5876   decrement(cnt);
5877   jccb(Assembler::greaterEqual, L_sloop);
5878   BIND(L_end);
5879 }
5880 
5881 void MacroAssembler::store_value_type_fields_to_buf(ciValueKlass* vk) {
5882 #ifndef _LP64
5883   super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
5884 #else
5885   // A value type might be returned. If fields are in registers we
5886   // need to allocate a value type instance and initialize it with
5887   // the values of the fields.
5888   Label skip, slow_case;
5889   // We only need a new buffered value if one was not already returned
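     // Bit 0 of rax acts as a tag: when set, the returned value's fields are in
     // registers (rax = ValueKlass* | 1, see the interpreter case below) and a
     // buffer must be allocated; when clear, rax already holds a buffered value oop.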
5890   testptr(rax, 1);
5891   jcc(Assembler::zero, skip);
5892 
5893   // Try to allocate a new buffered value (from the heap)
5894   if (UseTLAB) {
5895     // FIXME -- for smaller code, the inline allocation (and the slow case) should be moved inside the pack handler.
5896     if (vk != NULL) {
5897       // Called from C1, where the return type is statically known.
5898       movptr(rbx, (intptr_t)vk->get_ValueKlass());
5899       jint lh = vk->layout_helper();
5900       assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved");
5901       movl(r14, lh);
5902     } else {
5903       // Called from the interpreter. RAX contains ((the ValueKlass* of the return type) | 0x01)
5904       mov(rbx, rax);
5905       andptr(rbx, -2);
5906       movl(r14, Address(rbx, Klass::layout_helper_offset()));
5907     }
5908 
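         // TLAB bump-pointer allocation: r13 <- tlab_top, r14 <- r13 + instance size
         // (taken from the layout helper above); take the slow path if the new top
         // would exceed tlab_end, otherwise publish it and install the always-locked
         // mark word prototype used for buffered values.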
5909     movptr(r13, Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())));
5910     lea(r14, Address(r13, r14, Address::times_1));
5911     cmpptr(r14, Address(r15_thread, in_bytes(JavaThread::tlab_end_offset())));
5912     jcc(Assembler::above, slow_case);
5913     movptr(Address(r15_thread, in_bytes(JavaThread::tlab_top_offset())), r14);
5914     movptr(Address(r13, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::always_locked_prototype());
5915 
5916     xorl(rax, rax); // use zero reg to clear memory (shorter code)
5917     store_klass_gap(r13, rax);  // zero klass gap for compressed oops
5918 
5919     if (vk == NULL) {
5920       // store_klass corrupts rbx, so save it in rax for later use (interpreter case only).
5921       mov(rax, rbx);
5922     }
5923     store_klass(r13, rbx);  // klass
5924 
5925     // We have our new buffered value, initialize its fields with a
5926     // value class specific handler
5927     if (vk != NULL) {
5928       // FIXME -- do the packing in-line to avoid the runtime call
5929       mov(rax, r13);
5930       call(RuntimeAddress(vk->pack_handler()));
5931     } else {
5932       movptr(rbx, Address(rax, InstanceKlass::adr_valueklass_fixed_block_offset()));
5933       movptr(rbx, Address(rbx, ValueKlass::pack_handler_offset()));
5934       mov(rax, r13);
5935       call(rbx);
5936     }
5937     jmp(skip);
5938   }
5939 
5940   bind(slow_case);
5941   // We failed to allocate a new value, so fall back to a runtime
5942   // call. Some oop fields may be live in registers but we can't
5943   // tell; the runtime call will take care of preserving them
5944   // across a GC if there is one.
5945   super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
5946   bind(skip);
5947 #endif
5948 }
5949 
5950 
5951 // Move a value between registers/stack slots and update the reg_state
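     // reg_state semantics: reg_readonly marks a slot that still holds an unconsumed
     // source value and must not be clobbered yet, reg_writable marks a slot that is
     // free to receive a value, and reg_written marks a destination whose final value
     // is already in place.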
5952 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off, int extra_stack_offset) {
5953   if (reg_state[to->value()] == reg_written) {
5954     return true; // Already written
5955   }
5956   if (from != to && bt != T_VOID) {
5957     if (reg_state[to->value()] == reg_readonly) {
5958       return false; // Not yet writable
5959     }
5960     if (from->is_reg()) {
5961       if (to->is_reg()) {
5962         if (from->is_XMMRegister()) {
5963           if (bt == T_DOUBLE) {
5964             movdbl(to->as_XMMRegister(), from->as_XMMRegister());
5965           } else {
5966             assert(bt == T_FLOAT, "must be float");
5967             movflt(to->as_XMMRegister(), from->as_XMMRegister());
5968           }
5969         } else {
5970           movq(to->as_Register(), from->as_Register());
5971         }
5972       } else {
5973         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
5974         assert(st_off != ret_off, "overwriting return address at %d", st_off);
5975         Address to_addr = Address(rsp, st_off);
5976         if (from->is_XMMRegister()) {
5977           if (bt == T_DOUBLE) {
5978             movdbl(to_addr, from->as_XMMRegister());
5979           } else {
5980             assert(bt == T_FLOAT, "must be float");
5981             movflt(to_addr, from->as_XMMRegister());
5982           }
5983         } else {
5984           movq(to_addr, from->as_Register());
5985         }
5986       }
5987     } else {
5988       Address from_addr = Address(rsp, from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset);
5989       if (to->is_reg()) {
5990         if (to->is_XMMRegister()) {
5991           if (bt == T_DOUBLE) {
5992             movdbl(to->as_XMMRegister(), from_addr);
5993           } else {
5994             assert(bt == T_FLOAT, "must be float");
5995             movflt(to->as_XMMRegister(), from_addr);
5996           }
5997         } else {
5998           movq(to->as_Register(), from_addr);
5999         }
6000       } else {
6001         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6002         assert(st_off != ret_off, "overwriting return address at %d", st_off);
6003         movq(r13, from_addr);
6004         movq(Address(rsp, st_off), r13);
6005       }
6006     }
6007   }
6008   // Update register states
6009   reg_state[from->value()] = reg_writable;
6010   reg_state[to->value()] = reg_written;
6011   return true;
6012 }
6013 
6014 // Read all fields from a value type oop and store the values in registers/stack slots
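     // The scalarized signature brackets a value type's field entries between a
     // T_VALUETYPE entry and a trailing T_VOID delimiter; 'vt' tracks the nesting
     // depth while this helper walks the signature backwards from that delimiter,
     // loading each field from the value oop into its destination register/stack slot.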
6015 bool MacroAssembler::unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to,
6016                                          int& to_index, RegState reg_state[], int ret_off, int extra_stack_offset) {
6017   Register fromReg = from->is_reg() ? from->as_Register() : noreg;
6018   assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");
6019 
6020   int vt = 1;
6021   bool done = true;
6022   bool mark_done = true;
6023   do {
6024     sig_index--;
6025     BasicType bt = sig->at(sig_index)._bt;
6026     if (bt == T_VALUETYPE) {
6027       vt--;
6028     } else if (bt == T_VOID &&
6029                sig->at(sig_index-1)._bt != T_LONG &&
6030                sig->at(sig_index-1)._bt != T_DOUBLE) {
6031       vt++;
6032     } else if (SigEntry::is_reserved_entry(sig, sig_index)) {
6033       to_index--; // Ignore this
6034     } else {
6035       assert(to_index >= 0, "invalid to_index");
6036       VMRegPair pair_to = regs_to[to_index--];
6037       VMReg to = pair_to.first();
6038 
6039       if (bt == T_VOID) continue;
6040 
6041       int idx = (int)to->value();
6042       if (reg_state[idx] == reg_readonly) {
6043         if (idx != from->value()) {
6044           mark_done = false;
6045         }
6046         done = false;
6047         continue;
6048       } else if (reg_state[idx] == reg_written) {
6049         continue;
6050       } else {
6051         assert(reg_state[idx] == reg_writable, "must be writable");
6052         reg_state[idx] = reg_written;
6053       }
6054 
6055       if (fromReg == noreg) {
6056         int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6057         movq(r10, Address(rsp, st_off));
6058         fromReg = r10;
6059       }
6060 
6061       int off = sig->at(sig_index)._offset;
6062       assert(off > 0, "offset in object should be positive");
6063       bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
6064 
6065       Address fromAddr = Address(fromReg, off);
6066       bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN);
6067       if (!to->is_XMMRegister()) {
6068         Register dst = to->is_stack() ? r13 : to->as_Register();
6069         if (is_oop) {
6070           load_heap_oop(dst, fromAddr);
6071         } else {
6072           load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed);
6073         }
6074         if (to->is_stack()) {
6075           int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6076           assert(st_off != ret_off, "overwriting return address at %d", st_off);
6077           movq(Address(rsp, st_off), dst);
6078         }
6079       } else {
6080         if (bt == T_DOUBLE) {
6081           movdbl(to->as_XMMRegister(), fromAddr);
6082         } else {
6083           assert(bt == T_FLOAT, "must be float");
6084           movflt(to->as_XMMRegister(), fromAddr);
6085         }
6086       }
6087     }
6088   } while (vt != 0);
6089   if (mark_done && reg_state[from->value()] != reg_written) {
6090     // This is okay because no one else will write to that slot
6091     reg_state[from->value()] = reg_writable;
6092   }
6093   return done;
6094 }
6095 
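     // Helper that iterates over the registers/stack slots holding the scalarized
     // fields of a single value type argument in the compressed signature, skipping
     // value type delimiters and reserved entries.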
6096 class ScalarizedValueArgsStream : public StackObj {
6097   const GrowableArray<SigEntry>* _sig_cc;
6098   int _sig_cc_index;
6099   const VMRegPair* _regs_cc;
6100   int _regs_cc_count;
6101   int _regs_cc_index;
6102   int _vt;
6103   DEBUG_ONLY(bool _finished);
6104 public:
6105   ScalarizedValueArgsStream(const GrowableArray<SigEntry>* sig_cc, int sig_cc_index, VMRegPair* regs_cc, int regs_cc_count, int regs_cc_index) :
6106     _sig_cc(sig_cc), _sig_cc_index(sig_cc_index), _regs_cc(regs_cc), _regs_cc_count(regs_cc_count), _regs_cc_index(regs_cc_index) {
6107     assert(_sig_cc->at(_sig_cc_index)._bt == T_VALUETYPE, "should be at end delimiter");
6108     _vt = 1;
6109     DEBUG_ONLY(_finished = false);
6110   }
6111 
6112   bool next(VMRegPair& pair, BasicType& bt) {
6113     assert(!_finished, "sanity");
6114     do {
6115       _sig_cc_index++;
6116       bt = _sig_cc->at(_sig_cc_index)._bt;
6117       if (bt == T_VALUETYPE) {
6118         _vt++;
6119       } else if (bt == T_VOID &&
6120                  _sig_cc->at(_sig_cc_index-1)._bt != T_LONG &&
6121                  _sig_cc->at(_sig_cc_index-1)._bt != T_DOUBLE) {
6122         _vt--;
6123       } else if (SigEntry::is_reserved_entry(_sig_cc, _sig_cc_index)) {
6124         _regs_cc_index++;
6125       } else {
6126         assert(_regs_cc_index < _regs_cc_count, "must be");
6127         pair = _regs_cc[_regs_cc_index++];
6128         VMReg r1 = pair.first();
6129         VMReg r2 = pair.second();
6130 
6131         if (!r1->is_valid()) {
6132           assert(!r2->is_valid(), "must be invalid");
6133         } else {
6134           return true;
6135         }
6136       }
6137     } while (_vt != 0);
6138 
6139     DEBUG_ONLY(_finished = true);
6140     return false;
6141   }
6142 
6143   int sig_cc_index()  { return _sig_cc_index; }
6144   int regs_cc_index() { return _regs_cc_index; }
6145 };
6146 
6147 static void skip_unpacked_fields(const GrowableArray<SigEntry>* sig, int& sig_index, VMRegPair* regs_from, int regs_from_count, int& from_index) {
6148   ScalarizedValueArgsStream stream(sig, sig_index, regs_from, regs_from_count, from_index);
6149   VMRegPair from_pair;
6150   BasicType bt;
6151   while (stream.next(from_pair, bt)) {}
6152   sig_index = stream.sig_cc_index();
6153   from_index = stream.regs_cc_index();
6154 }
6155 
6156 static bool is_reg_in_unpacked_fields(const GrowableArray<SigEntry>* sig, int sig_index, VMReg to, VMRegPair* regs_from, int regs_from_count, int from_index) {
6157   ScalarizedValueArgsStream stream(sig, sig_index, regs_from, regs_from_count, from_index);
6158   VMRegPair from_pair;
6159   BasicType bt;
6160   while (stream.next(from_pair, bt)) {
6161     if (from_pair.first() == to) {
6162       return true;
6163     }
6164   }
6165 
6166   return false;
6167 }
6168 
6169 // Pack fields back into a value type oop
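     // The destination 'to' receives the oop of a pre-allocated buffered value loaded
     // from the value array passed in rax; each scalarized source field is then stored
     // into that buffer at the offset recorded in the signature.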
6170 bool MacroAssembler::pack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
6171                                        VMReg to, VMRegPair* regs_from, int regs_from_count, int& from_index, RegState reg_state[],
6172                                        int ret_off, int extra_stack_offset) {
6173   assert(sig->at(sig_index)._bt == T_VALUETYPE, "should be at end delimiter");
6174   assert(to->is_valid(), "must be");
6175 
6176   if (reg_state[to->value()] == reg_written) {
6177     skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
6178     return true; // Already written
6179   }
6180 
6181   Register val_array = rax;
6182   Register val_obj_tmp = r11;
6183   Register from_reg_tmp = r10;
6184   Register tmp1 = r14;
6185   Register tmp2 = r13;
6186   Register tmp3 = rbx;
6187   Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register();
6188 
6189   if (reg_state[to->value()] == reg_readonly) {
6190     if (!is_reg_in_unpacked_fields(sig, sig_index, to, regs_from, regs_from_count, from_index)) {
6191       skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
6192       return false; // Not yet writable
6193     }
6194     val_obj = val_obj_tmp;
6195   }
6196 
6197   int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_VALUETYPE);
6198   load_heap_oop(val_obj, Address(val_array, index));
6199 
6200   ScalarizedValueArgsStream stream(sig, sig_index, regs_from, regs_from_count, from_index);
6201   VMRegPair from_pair;
6202   BasicType bt;
6203   while (stream.next(from_pair, bt)) {
6204     int off = sig->at(stream.sig_cc_index())._offset;
6205     assert(off > 0, "offset in object should be positive");
6206     bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
6207     size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
6208 
6209     VMReg from_r1 = from_pair.first();
6210     VMReg from_r2 = from_pair.second();
6211 
6212     // Pack the scalarized field into the value object.
6213     Address dst(val_obj, off);
6214     if (!from_r1->is_XMMRegister()) {
6215       Register from_reg;
6216 
6217       if (from_r1->is_stack()) {
6218         from_reg = from_reg_tmp;
6219         int ld_off = from_r1->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6220         load_sized_value(from_reg, Address(rsp, ld_off), size_in_bytes, /* is_signed */ false);
6221       } else {
6222         from_reg = from_r1->as_Register();
6223       }
6224 
6225       if (is_oop) {
6226         DecoratorSet decorators = IN_HEAP | ACCESS_WRITE;
6227         store_heap_oop(dst, from_reg, tmp1, tmp2, tmp3, decorators);
6228       } else {
6229         store_sized_value(dst, from_reg, size_in_bytes);
6230       }
6231     } else {
6232       if (from_r2->is_valid()) {
6233         movdbl(dst, from_r1->as_XMMRegister());
6234       } else {
6235         movflt(dst, from_r1->as_XMMRegister());
6236       }
6237     }
6238     reg_state[from_r1->value()] = reg_writable;
6239   }
6240   sig_index = stream.sig_cc_index();
6241   from_index = stream.regs_cc_index();
6242 
6243   assert(reg_state[to->value()] == reg_writable, "must have already been read");
6244   bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state, ret_off, extra_stack_offset);
6245   assert(success, "to register must be writeable");
6246 
6247   return true;
6248 }
6249 
6250 // Unpack all value type arguments passed as oops
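     // Computes both calling conventions (the oop-passing one used by the caller and
     // the scalarized one expected by the compiled body), emits the shuffle that
     // unpacks incoming value type oops into their scalarized locations, then emits
     // the verified entry with the stack increment needed for stack repair.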
6251 void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
6252   assert(C->has_scalarized_args(), "value type argument scalarization is disabled");
6253   Method* method = C->method()->get_Method();
6254   const GrowableArray<SigEntry>* sig_cc = method->adapter()->get_sig_cc();
6255   assert(sig_cc != NULL, "must have scalarized signature");
6256 
6257   // Get unscalarized calling convention
6258   BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType, sig_cc->length()); // FIXME - may underflow if we support values with no fields!
6259   int args_passed = 0;
6260   if (!method->is_static()) {
6261     sig_bt[args_passed++] = T_OBJECT;
6262   }
6263   if (!receiver_only) {
6264     for (SignatureStream ss(method->signature()); !ss.at_return_type(); ss.next()) {
6265       BasicType bt = ss.type();
6266       sig_bt[args_passed++] = bt;
6267       if (type2size[bt] == 2) {
6268         sig_bt[args_passed++] = T_VOID;
6269       }
6270     }
6271   } else {
6272     // Only unpack the receiver, all other arguments are already scalarized
6273     InstanceKlass* holder = method->method_holder();
6274     int rec_len = holder->is_value() ? ValueKlass::cast(holder)->extended_sig()->length() : 1;
6275     // Copy scalarized signature but skip receiver, value type delimiters and reserved entries
6276     for (int i = 0; i < sig_cc->length(); i++) {
6277       if (!SigEntry::is_reserved_entry(sig_cc, i)) {
6278         if (SigEntry::skip_value_delimiters(sig_cc, i) && rec_len <= 0) {
6279           sig_bt[args_passed++] = sig_cc->at(i)._bt;
6280         }
6281         rec_len--;
6282       }
6283     }
6284   }
6285   VMRegPair* regs = NEW_RESOURCE_ARRAY(VMRegPair, args_passed);
6286   int args_on_stack = SharedRuntime::java_calling_convention(sig_bt, regs, args_passed, false);
6287 
6288   // Get scalarized calling convention
6289   int args_passed_cc = SigEntry::fill_sig_bt(sig_cc, sig_bt);
6290   VMRegPair* regs_cc = NEW_RESOURCE_ARRAY(VMRegPair, sig_cc->length());
6291   int args_on_stack_cc = SharedRuntime::java_calling_convention(sig_bt, regs_cc, args_passed_cc, false);
6292 
6293   int extra_stack_offset = wordSize; // stack has the return address
6294   int sp_inc = shuffle_value_args(false, receiver_only, extra_stack_offset, sig_bt, sig_cc,
6295                                   args_passed, args_on_stack, regs,
6296                                   args_passed_cc, args_on_stack_cc, regs_cc);
6297   // Emit code for verified entry and save increment for stack repair on return
6298   verified_entry(C, sp_inc);
6299 }
6300 
6301 static void mark_reg_writable(const VMRegPair* regs, int num_regs, int reg_index, MacroAssembler::RegState* reg_state) {
6302   assert(0 <= reg_index && reg_index < num_regs, "sanity");
6303   VMReg from_reg = regs[reg_index].first();
6304   if (from_reg->is_valid()) {
6305     assert(from_reg->is_stack(), "reserved entries must be stack");
6306     reg_state[from_reg->value()] = MacroAssembler::reg_writable;
6307   }
6308 }
6309 
6310 static void mark_reserved_entries_writable(const GrowableArray<SigEntry>* sig_cc, const VMRegPair* regs, int num_regs, MacroAssembler::RegState* reg_state) {
6311   int reg_index = 0;
6312   for (int sig_index = 0; sig_index < sig_cc->length(); sig_index++) {
6313     if (SigEntry::is_reserved_entry(sig_cc, sig_index)) {
6314       mark_reg_writable(regs, num_regs, reg_index, reg_state);
6315       reg_index++;
6316     } else if (SigEntry::skip_value_delimiters(sig_cc, sig_index)) {
6317       reg_index++;
6318     } else {
6319       int vt = 1;
6320       do {
6321         sig_index++;
6322         BasicType bt = sig_cc->at(sig_index)._bt;
6323         if (bt == T_VALUETYPE) {
6324           vt++;
6325         } else if (bt == T_VOID &&
6326                    sig_cc->at(sig_index-1)._bt != T_LONG &&
6327                    sig_cc->at(sig_index-1)._bt != T_DOUBLE) {
6328           vt--;
6329         } else if (SigEntry::is_reserved_entry(sig_cc, sig_index)) {
6330           mark_reg_writable(regs, num_regs, reg_index, reg_state);
6331           reg_index++;
6332         } else {
6333           reg_index++;
6334         }
6335       } while (vt != 0);
6336     }
6337   }
6338 }
6339 
6340 static MacroAssembler::RegState* init_reg_state(bool is_packing, const GrowableArray<SigEntry>* sig_cc,
6341                                                 VMRegPair* regs, int num_regs, int sp_inc, int max_stack) {
6342   int max_reg = VMRegImpl::stack2reg(max_stack)->value();
6343   MacroAssembler::RegState* reg_state = NEW_RESOURCE_ARRAY(MacroAssembler::RegState, max_reg);
6344 
6345   // Make all writable
6346   for (int i = 0; i < max_reg; ++i) {
6347     reg_state[i] = MacroAssembler::reg_writable;
6348   }
6349   // Set all source registers/stack slots to readonly to prevent accidental overwriting
6350   for (int i = 0; i < num_regs; ++i) {
6351     VMReg reg = regs[i].first();
6352     if (!reg->is_valid()) continue;
6353     if (reg->is_stack()) {
6354       // Update source stack location by adding stack increment
6355       reg = VMRegImpl::stack2reg(reg->reg2stack() + sp_inc/VMRegImpl::stack_slot_size);
6356       regs[i] = reg;
6357     }
6358     assert(reg->value() >= 0 && reg->value() < max_reg, "reg value out of bounds");
6359     reg_state[reg->value()] = MacroAssembler::reg_readonly;
6360   }
6361   if (is_packing) {
6362     // The reserved entries are not used by the packed args, so make them writable
6363     mark_reserved_entries_writable(sig_cc, regs, num_regs, reg_state);
6364   }
6365 
6366   return reg_state;
6367 }
6368 
6369 int MacroAssembler::shuffle_value_args(bool is_packing, bool receiver_only, int extra_stack_offset,
6370                                        BasicType* sig_bt, const GrowableArray<SigEntry>* sig_cc,
6371                                        int args_passed, int args_on_stack, VMRegPair* regs,            // from
6372                                        int args_passed_to, int args_on_stack_to, VMRegPair* regs_to) { // to
6373   // Check if we need to extend the stack for unpacking
6374   int sp_inc = (args_on_stack_to - args_on_stack) * VMRegImpl::stack_slot_size;
6375   if (sp_inc > 0) {
6376     // Save the return address, adjust the stack (make sure it is properly
6377     // 16-byte aligned) and copy the return address to the new top of the stack.
6378     pop(r13);
6379     sp_inc = align_up(sp_inc, StackAlignmentInBytes);
6380     subptr(rsp, sp_inc);
6381     push(r13);
6382   } else {
6383     // The scalarized calling convention needs less stack space than the unscalarized one.
6384     // No need to extend the stack, the caller will take care of these adjustments.
6385     sp_inc = 0;
6386   }
6387 
6388   int ret_off; // make sure we don't overwrite the return address
6389   if (is_packing) {
6390     // For C1 code, the VVEP doesn't have reserved slots, so we store the return address at
6391     // rsp[0] during shuffling.
6392     ret_off = 0;
6393   } else {
6394     // C2 code ensures that sp_inc is a reserved slot.
6395     ret_off = sp_inc;
6396   }
6397 
6398   int max_stack = MAX2(args_on_stack + sp_inc/VMRegImpl::stack_slot_size, args_on_stack_to);
6399   RegState* reg_state = init_reg_state(is_packing, sig_cc, regs, args_passed, sp_inc, max_stack);
6400 
6401   // Emit code for packing/unpacking value type arguments
6402   // We try multiple times and eventually start spilling to resolve (circular) dependencies
6403   bool done = false;
6404   for (int i = 0; i < 2*args_passed_to && !done; ++i) {
6405     done = true;
6406     bool spill = (i > args_passed_to); // Start spilling?
6407     // Iterate over all arguments (when unpacking, do in reverse)
6408     int step = is_packing ? 1 : -1;
6409     int from_index    = is_packing ? 0 : args_passed      - 1;
6410     int to_index      = is_packing ? 0 : args_passed_to   - 1;
6411     int sig_index     = is_packing ? 0 : sig_cc->length() - 1;
6412     int sig_index_end = is_packing ? sig_cc->length() : -1;
6413     int vtarg_index = 0;
6414     for (; sig_index != sig_index_end; sig_index += step) {
6415       assert(0 <= sig_index && sig_index < sig_cc->length(), "index out of bounds");
6416       if (SigEntry::is_reserved_entry(sig_cc, sig_index)) {
6417         if (is_packing) {
6418           from_index += step;
6419         } else {
6420           to_index += step;
6421         }
6422       } else {
6423         assert(0 <= from_index && from_index < args_passed, "index out of bounds");
6424         assert(0 <= to_index && to_index < args_passed_to, "index out of bounds");
6425         if (spill) {
6426           // This call returns true IFF we should keep trying to spill in this round.
6427           spill = shuffle_value_args_spill(is_packing, sig_cc, sig_index, regs, from_index, args_passed,
6428                                            reg_state, ret_off, extra_stack_offset);
6429         }
6430         BasicType bt = sig_cc->at(sig_index)._bt;
6431         if (SigEntry::skip_value_delimiters(sig_cc, sig_index)) {
6432           VMReg from_reg = regs[from_index].first();
6433           done &= move_helper(from_reg, regs_to[to_index].first(), bt, reg_state, ret_off, extra_stack_offset);
6434           to_index += step;
6435         } else if (is_packing || !receiver_only || (from_index == 0 && bt == T_VOID)) {
6436           if (is_packing) {
6437             VMReg reg_to = regs_to[to_index].first();
6438             done &= pack_value_helper(sig_cc, sig_index, vtarg_index, reg_to, regs, args_passed, from_index,
6439                                       reg_state, ret_off, extra_stack_offset);
6440             vtarg_index++;
6441             to_index++;
6442             continue; // from_index already adjusted
6443           } else {
6444             VMReg from_reg = regs[from_index].first();
6445             done &= unpack_value_helper(sig_cc, sig_index, from_reg, regs_to, to_index, reg_state, ret_off, extra_stack_offset);
6446           }
6447         } else {
6448           continue;
6449         }
6450         from_index += step;
6451       }
6452     }
6453   }
6454   guarantee(done, "Could not resolve circular dependency when shuffling value type arguments");
6455   return sp_inc;
6456 }
6457 
6458 bool MacroAssembler::shuffle_value_args_spill(bool is_packing, const GrowableArray<SigEntry>* sig_cc, int sig_cc_index,
6459                                               VMRegPair* regs_from, int from_index, int regs_from_count,
6460                                               RegState* reg_state, int ret_off, int extra_stack_offset) {
6461   VMReg reg;
6462 
6463   if (!is_packing || SigEntry::skip_value_delimiters(sig_cc, sig_cc_index)) {
6464     reg = regs_from[from_index].first();
6465     if (!reg->is_valid() || reg_state[reg->value()] != reg_readonly) {
6466       // Spilling this won't break circles
6467       return true;
6468     }
6469   } else {
6470     ScalarizedValueArgsStream stream(sig_cc, sig_cc_index, regs_from, regs_from_count, from_index);
6471     VMRegPair from_pair;
6472     BasicType bt;
6473     bool found = false;
6474     while (stream.next(from_pair, bt)) {
6475       reg = from_pair.first();
6476       assert(reg->is_valid(), "must be");
6477       if (reg_state[reg->value()] == reg_readonly) {
6478         found = true;
6479         break;
6480       }
6481     }
6482     if (!found) {
6483       // Spilling fields in this value arg won't break circles
6484       return true;
6485     }
6486   }
6487 
6488   // Spill argument to be able to write the source and resolve circular dependencies
6489   VMReg spill_reg = reg->is_XMMRegister() ? xmm8->as_VMReg() : r14->as_VMReg();
6490   if (reg_state[spill_reg->value()] == reg_readonly) {
6491     // We have already spilled (in previous round). The spilled register should be consumed by this round.
6492   } else {
6493     bool res = move_helper(reg, spill_reg, T_DOUBLE, reg_state, ret_off, extra_stack_offset);
6494     assert(res, "Spilling should not fail");
6495     // Set spill_reg as new source and update state
6496     reg = spill_reg;
6497     regs_from[from_index].set1(reg);
6498     reg_state[reg->value()] = reg_readonly;
6499   }
6500 
6501   return false; // Do not spill again in this round
6502 }
6503 
6504 // Restores the stack on return
6505 void MacroAssembler::restore_stack(Compile* C) {
6506   int framesize = C->frame_size_in_bytes();
6507   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
6508   // Remove words for the return addr (already pushed) and for RBP
6509   framesize -= 2*wordSize;
6510 
6511   if (C->needs_stack_repair()) {
6512     // Restore rbp and repair rsp by adding the stack increment
6513     movq(rbp, Address(rsp, framesize));
6514     addq(rsp, Address(rsp, C->sp_inc_offset()));
6515   } else {
6516     if (framesize > 0) {
6517       addq(rsp, framesize);
6518     }
6519     pop(rbp);
6520   }
6521 }
6522 
6523 void MacroAssembler::clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, bool is_large, bool word_copy_only) {
6524   // cnt - number of qwords (8-byte words).
6525   // base - start address, qword aligned.
6526   // is_large - if optimizers know cnt is larger than InitArrayShortSize
6527   assert(base==rdi, "base register must be edi for rep stos");
6528   assert(val==rax,   "tmp register must be eax for rep stos");
6529   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
6530   assert(InitArrayShortSize % BytesPerLong == 0,
6531     "InitArrayShortSize should be a multiple of BytesPerLong");
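     // Dispatch: counts up to InitArrayShortSize are handled by the simple
     // pointer-sized store loop below; larger counts use rep stosb (UseFastStosb,
     // unless only word copies are allowed), XMM/YMM stores (UseXMMForObjInit),
     // or rep stos.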
6532 
6533   Label DONE;
6534 
6535   if (!is_large) {
6536     Label LOOP, LONG;
6537     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
6538     jccb(Assembler::greater, LONG);
6539 
6540     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6541 
6542     decrement(cnt);
6543     jccb(Assembler::negative, DONE); // Zero length
6544 
6545     // Use individual pointer-sized stores for small counts:
6546     BIND(LOOP);
6547     movptr(Address(base, cnt, Address::times_ptr), val);
6548     decrement(cnt);
6549     jccb(Assembler::greaterEqual, LOOP);
6550     jmpb(DONE);
6551 
6552     BIND(LONG);
6553   }
6554 
6555   // Use longer rep-prefixed ops for non-small counts:
6556   if (UseFastStosb && !word_copy_only) {
6557     shlptr(cnt, 3); // convert to number of bytes
6558     rep_stosb();
6559   } else if (UseXMMForObjInit) {
6560     xmm_clear_mem(base, cnt, val, xtmp);
6561   } else {
6562     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
6563     rep_stos();
6564   }
6565 
6566   BIND(DONE);
6567 }
6568 
6569 #ifdef COMPILER2
6570 
6571 // IndexOf for constant substrings with size >= 8 chars
6572 // which don't need to be loaded through the stack.
6573 void MacroAssembler::string_indexofC8(Register str1, Register str2,
6574                                       Register cnt1, Register cnt2,
6575                                       int int_cnt2,  Register result,
6576                                       XMMRegister vec, Register tmp,
6577                                       int ae) {
6578   ShortBranchVerifier sbv(this);
6579   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6580   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6581 
6582   // This method uses the pcmpestri instruction with bound registers
6583   //   inputs:
6584   //     xmm - substring
6585   //     rax - substring length (elements count)
6586   //     mem - scanned string
6587   //     rdx - string length (elements count)
6588   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6589   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6590   //   outputs:
6591   //     rcx - matched index in string
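     //   flags used by the code below:
     //     CF == 1 - a candidate match was found, its element index is left in rcx
     //     OF == 1 - the match starts at element 0 (treated as a full match of the
     //               loaded substring)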
6592   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6593   int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6594   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6595   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6596   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6597 
6598   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
6599         RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
6600         MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
6601 
6602   // Note, inline_string_indexOf() generates checks:
6603   // if (substr.count > string.count) return -1;
6604   // if (substr.count == 0) return 0;
6605   assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
6606 
6607   // Load substring.
6608   if (ae == StrIntrinsicNode::UL) {
6609     pmovzxbw(vec, Address(str2, 0));
6610   } else {
6611     movdqu(vec, Address(str2, 0));
6612   }
6613   movl(cnt2, int_cnt2);
6614   movptr(result, str1); // string addr
6615 
6616   if (int_cnt2 > stride) {
6617     jmpb(SCAN_TO_SUBSTR);
6618 
6619     // Reload substr for rescan, this code
6620     // is executed only for large substrings (> 8 chars)
6621     bind(RELOAD_SUBSTR);
6622     if (ae == StrIntrinsicNode::UL) {
6623       pmovzxbw(vec, Address(str2, 0));
6624     } else {
6625       movdqu(vec, Address(str2, 0));
6626     }
6627     negptr(cnt2); // Jumped here with negative cnt2, convert to positive
6628 
6629     bind(RELOAD_STR);
6630     // We came here after the beginning of the substring was
6631     // matched but the rest of it was not so we need to search
6632     // again. Start from the next element after the previous match.
6633 
6634   // cnt2 is the number of remaining substring elements and
6635   // cnt1 is the number of remaining string elements when the compare failed.
6636     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
6637     subl(cnt1, cnt2);
6638     addl(cnt1, int_cnt2);
6639     movl(cnt2, int_cnt2); // Now restore cnt2
6640 
6641     decrementl(cnt1);     // Shift to next element
6642     cmpl(cnt1, cnt2);
6643   jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6644 
6645     addptr(result, (1<<scale1));
6646 
6647   } // (int_cnt2 > 8)
6648 
6649   // Scan string for start of substr in 16-byte vectors
6650   bind(SCAN_TO_SUBSTR);
6651   pcmpestri(vec, Address(result, 0), mode);
6652   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6653   subl(cnt1, stride);
6654   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6655   cmpl(cnt1, cnt2);
6656   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6657   addptr(result, 16);
6658   jmpb(SCAN_TO_SUBSTR);
6659 
6660   // Found a potential substr
6661   bind(FOUND_CANDIDATE);
6662   // Matched whole vector if first element matched (tmp(rcx) == 0).
6663   if (int_cnt2 == stride) {
6664     jccb(Assembler::overflow, RET_FOUND);    // OF == 1
6665   } else { // int_cnt2 > 8
6666     jccb(Assembler::overflow, FOUND_SUBSTR);
6667   }
6668   // After pcmpestri tmp(rcx) contains matched element index
6669   // Compute start addr of substr
6670   lea(result, Address(result, tmp, scale1));
6671 
6672   // Make sure string is still long enough
6673   subl(cnt1, tmp);
6674   cmpl(cnt1, cnt2);
6675   if (int_cnt2 == stride) {
6676     jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6677   } else { // int_cnt2 > 8
6678     jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
6679   }
6680   // Left less than substring.
6681 
6682   bind(RET_NOT_FOUND);
6683   movl(result, -1);
6684   jmp(EXIT);
6685 
6686   if (int_cnt2 > stride) {
6687     // This code is optimized for the case when whole substring
6688     // is matched if its head is matched.
6689     bind(MATCH_SUBSTR_HEAD);
6690     pcmpestri(vec, Address(result, 0), mode);
6691     // Reload only the string if it does not match
6692     jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
6693 
6694     Label CONT_SCAN_SUBSTR;
6695     // Compare the rest of substring (> 8 chars).
6696     bind(FOUND_SUBSTR);
6697     // First 8 chars are already matched.
6698     negptr(cnt2);
6699     addptr(cnt2, stride);
6700 
6701     bind(SCAN_SUBSTR);
6702     subl(cnt1, stride);
6703     cmpl(cnt2, -stride); // Do not read beyond substring
6704     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
6705     // Back up the string pointers to avoid reading beyond the substring:
6706     // cnt1 = cnt1 - cnt2 + 8
6707     addl(cnt1, cnt2); // cnt2 is negative
6708     addl(cnt1, stride);
6709     movl(cnt2, stride); negptr(cnt2);
6710     bind(CONT_SCAN_SUBSTR);
6711     if (int_cnt2 < (int)G) {
6712       int tail_off1 = int_cnt2<<scale1;
6713       int tail_off2 = int_cnt2<<scale2;
6714       if (ae == StrIntrinsicNode::UL) {
6715         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
6716       } else {
6717         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
6718       }
6719       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
6720     } else {
6721       // calculate index in register to avoid integer overflow (int_cnt2*2)
6722       movl(tmp, int_cnt2);
6723       addptr(tmp, cnt2);
6724       if (ae == StrIntrinsicNode::UL) {
6725         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
6726       } else {
6727         movdqu(vec, Address(str2, tmp, scale2, 0));
6728       }
6729       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
6730     }
6731     // Need to reload string pointers if we did not match the whole vector
6732     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6733     addptr(cnt2, stride);
6734     jcc(Assembler::negative, SCAN_SUBSTR);
6735     // Fall through if found full substring
6736 
6737   } // (int_cnt2 > 8)
6738 
6739   bind(RET_FOUND);
6740   // Found result if we matched full small substring.
6741   // Compute substr offset
6742   subptr(result, str1);
6743   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6744     shrl(result, 1); // index
6745   }
6746   bind(EXIT);
6747 
6748 } // string_indexofC8
6749 
6750 // Small strings are loaded through the stack if they cross a page boundary.
6751 void MacroAssembler::string_indexof(Register str1, Register str2,
6752                                     Register cnt1, Register cnt2,
6753                                     int int_cnt2,  Register result,
6754                                     XMMRegister vec, Register tmp,
6755                                     int ae) {
6756   ShortBranchVerifier sbv(this);
6757   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
6758   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
6759 
6760   //
6761   // int_cnt2 is length of small (< 8 chars) constant substring
6762   // or (-1) for a non-constant substring, in which case its length
6763   // is in the cnt2 register.
6764   //
6765   // Note, inline_string_indexOf() generates checks:
6766   // if (substr.count > string.count) return -1;
6767   // if (substr.count == 0) return 0;
6768   //
6769   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
6770   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
6771   // This method uses the pcmpestri instruction with bound registers
6772   //   inputs:
6773   //     xmm - substring
6774   //     rax - substring length (elements count)
6775   //     mem - scanned string
6776   //     rdx - string length (elements count)
6777   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6778   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
6779   //   outputs:
6780   //     rcx - matched index in string
6781   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6782   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
6783   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
6784   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
6785 
6786   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6787         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6788         FOUND_CANDIDATE;
6789 
6790   { //========================================================
6791     // We don't know where these strings are located
6792     // and we can't read beyond them. Load them through stack.
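         // The copies below move each small string into a fresh 16-byte stack area so
         // that a full 16-byte vector load cannot fault past a page boundary; the
         // original rsp is saved in tmp and restored at CLEANUP.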
6793     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6794 
6795     movptr(tmp, rsp); // save old SP
6796 
6797     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
6798       if (int_cnt2 == (1>>scale2)) { // One byte
6799         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
6800         load_unsigned_byte(result, Address(str2, 0));
6801         movdl(vec, result); // move 32 bits
6802       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
6803         // Not enough header space in 32-bit VM: 12+3 = 15.
6804         movl(result, Address(str2, -1));
6805         shrl(result, 8);
6806         movdl(vec, result); // move 32 bits
6807       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
6808         load_unsigned_short(result, Address(str2, 0));
6809         movdl(vec, result); // move 32 bits
6810       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
6811         movdl(vec, Address(str2, 0)); // move 32 bits
6812       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
6813         movq(vec, Address(str2, 0));  // move 64 bits
6814       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
6815         // Array header size is 12 bytes in 32-bit VM
6816         // + 6 bytes for 3 chars == 18 bytes,
6817         // enough space to load vec and shift.
6818         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6819         if (ae == StrIntrinsicNode::UL) {
6820           int tail_off = int_cnt2-8;
6821           pmovzxbw(vec, Address(str2, tail_off));
6822           psrldq(vec, -2*tail_off);
6823         }
6824         else {
6825           int tail_off = int_cnt2*(1<<scale2);
6826           movdqu(vec, Address(str2, tail_off-16));
6827           psrldq(vec, 16-tail_off);
6828         }
6829       }
6830     } else { // not constant substring
6831       cmpl(cnt2, stride);
6832       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6833 
6834       // We can read beyond the string if str+16 does not cross a page boundary
6835       // since heaps are aligned and mapped by pages.
6836       assert(os::vm_page_size() < (int)G, "default page should be small");
6837       movl(result, str2); // We need only low 32 bits
6838       andl(result, (os::vm_page_size()-1));
6839       cmpl(result, (os::vm_page_size()-16));
6840       jccb(Assembler::belowEqual, CHECK_STR);
6841 
6842       // Move small strings to the stack to allow loading 16 bytes into vec.
6843       subptr(rsp, 16);
6844       int stk_offset = wordSize-(1<<scale2);
6845       push(cnt2);
6846 
6847       bind(COPY_SUBSTR);
6848       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
6849         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
6850         movb(Address(rsp, cnt2, scale2, stk_offset), result);
6851       } else if (ae == StrIntrinsicNode::UU) {
6852         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
6853         movw(Address(rsp, cnt2, scale2, stk_offset), result);
6854       }
6855       decrement(cnt2);
6856       jccb(Assembler::notZero, COPY_SUBSTR);
6857 
6858       pop(cnt2);
6859       movptr(str2, rsp);  // New substring address
6860     } // non constant
6861 
6862     bind(CHECK_STR);
6863     cmpl(cnt1, stride);
6864     jccb(Assembler::aboveEqual, BIG_STRINGS);
6865 
6866     // Check cross page boundary.
6867     movl(result, str1); // We need only low 32 bits
6868     andl(result, (os::vm_page_size()-1));
6869     cmpl(result, (os::vm_page_size()-16));
6870     jccb(Assembler::belowEqual, BIG_STRINGS);
6871 
6872     subptr(rsp, 16);
6873     int stk_offset = -(1<<scale1);
6874     if (int_cnt2 < 0) { // not constant
6875       push(cnt2);
6876       stk_offset += wordSize;
6877     }
6878     movl(cnt2, cnt1);
6879 
6880     bind(COPY_STR);
6881     if (ae == StrIntrinsicNode::LL) {
6882       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
6883       movb(Address(rsp, cnt2, scale1, stk_offset), result);
6884     } else {
6885       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
6886       movw(Address(rsp, cnt2, scale1, stk_offset), result);
6887     }
6888     decrement(cnt2);
6889     jccb(Assembler::notZero, COPY_STR);
6890 
6891     if (int_cnt2 < 0) { // not constant
6892       pop(cnt2);
6893     }
6894     movptr(str1, rsp);  // New string address
6895 
6896     bind(BIG_STRINGS);
6897     // Load substring.
6898     if (int_cnt2 < 0) { // -1
6899       if (ae == StrIntrinsicNode::UL) {
6900         pmovzxbw(vec, Address(str2, 0));
6901       } else {
6902         movdqu(vec, Address(str2, 0));
6903       }
6904       push(cnt2);       // substr count
6905       push(str2);       // substr addr
6906       push(str1);       // string addr
6907     } else {
6908       // Small (< 8 chars) constant substrings are loaded already.
6909       movl(cnt2, int_cnt2);
6910     }
6911     push(tmp);  // original SP
6912 
6913   } // Finished loading
6914 
6915   //========================================================
6916   // Start search
6917   //
6918 
6919   movptr(result, str1); // string addr
6920 
6921   if (int_cnt2 < 0) {  // Only for a non-constant substring
6922     jmpb(SCAN_TO_SUBSTR);
6923 
6924     // SP saved at sp+0
6925     // String saved at sp+1*wordSize
6926     // Substr saved at sp+2*wordSize
6927     // Substr count saved at sp+3*wordSize
6928 
6929     // Reload substr for rescan, this code
6930     // is executed only for large substrings (> 8 chars)
6931     bind(RELOAD_SUBSTR);
6932     movptr(str2, Address(rsp, 2*wordSize));
6933     movl(cnt2, Address(rsp, 3*wordSize));
6934     if (ae == StrIntrinsicNode::UL) {
6935       pmovzxbw(vec, Address(str2, 0));
6936     } else {
6937       movdqu(vec, Address(str2, 0));
6938     }
6939     // We came here after the beginning of the substring was
6940     // matched but the rest of it was not so we need to search
6941     // again. Start from the next element after the previous match.
6942     subptr(str1, result); // Restore counter
6943     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
6944       shrl(str1, 1);
6945     }
6946     addl(cnt1, str1);
6947     decrementl(cnt1);   // Shift to next element
6948     cmpl(cnt1, cnt2);
6949     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6950 
6951     addptr(result, (1<<scale1));
6952   } // non constant
6953 
6954   // Scan string for start of substr in 16-byte vectors
6955   bind(SCAN_TO_SUBSTR);
6956   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6957   pcmpestri(vec, Address(result, 0), mode);
6958   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
6959   subl(cnt1, stride);
6960   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6961   cmpl(cnt1, cnt2);
6962   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
6963   addptr(result, 16);
6964 
6965   bind(ADJUST_STR);
6966   cmpl(cnt1, stride); // Do not read beyond string
6967   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6968   // Back up the string pointer to avoid reading beyond the string.
6969   lea(result, Address(result, cnt1, scale1, -16));
6970   movl(cnt1, stride);
6971   jmpb(SCAN_TO_SUBSTR);
6972 
6973   // Found a potential substr
6974   bind(FOUND_CANDIDATE);
6975   // After pcmpestri tmp(rcx) contains matched element index
6976 
6977   // Make sure string is still long enough
6978   subl(cnt1, tmp);
6979   cmpl(cnt1, cnt2);
6980   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6981   // Left less than substring.
6982 
6983   bind(RET_NOT_FOUND);
6984   movl(result, -1);
6985   jmp(CLEANUP);
6986 
6987   bind(FOUND_SUBSTR);
6988   // Compute start addr of substr
6989   lea(result, Address(result, tmp, scale1));
6990   if (int_cnt2 > 0) { // Constant substring
6991     // Repeat search for small substring (< 8 chars)
6992     // from new point without reloading substring.
6993     // Have to check that we don't read beyond string.
6994     cmpl(tmp, stride-int_cnt2);
6995     jccb(Assembler::greater, ADJUST_STR);
6996     // Fall through if matched whole substring.
6997   } else { // non constant
6998     assert(int_cnt2 == -1, "should be != 0");
6999 
7000     addl(tmp, cnt2);
7001     // Found result if we matched whole substring.
7002     cmpl(tmp, stride);
7003     jcc(Assembler::lessEqual, RET_FOUND);
7004 
7005     // Repeat search for small substring (<= 8 chars)
7006     // from new point 'str1' without reloading substring.
7007     cmpl(cnt2, stride);
7008     // Have to check that we don't read beyond string.
7009     jccb(Assembler::lessEqual, ADJUST_STR);
7010 
7011     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
7012     // Compare the rest of substring (> 8 chars).
7013     movptr(str1, result);
7014 
7015     cmpl(tmp, cnt2);
7016     // First 8 chars are already matched.
7017     jccb(Assembler::equal, CHECK_NEXT);
7018 
7019     bind(SCAN_SUBSTR);
7020     pcmpestri(vec, Address(str1, 0), mode);
7021     // Need to reload string pointers if we did not match the whole vector
7022     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
7023 
7024     bind(CHECK_NEXT);
7025     subl(cnt2, stride);
7026     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
7027     addptr(str1, 16);
7028     if (ae == StrIntrinsicNode::UL) {
7029       addptr(str2, 8);
7030     } else {
7031       addptr(str2, 16);
7032     }
7033     subl(cnt1, stride);
7034     cmpl(cnt2, stride); // Do not read beyond substring
7035     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
7036     // Back up the string pointers to avoid reading beyond the substring.
7037 
7038     if (ae == StrIntrinsicNode::UL) {
7039       lea(str2, Address(str2, cnt2, scale2, -8));
7040       lea(str1, Address(str1, cnt2, scale1, -16));
7041     } else {
7042       lea(str2, Address(str2, cnt2, scale2, -16));
7043       lea(str1, Address(str1, cnt2, scale1, -16));
7044     }
7045     subl(cnt1, cnt2);
7046     movl(cnt2, stride);
7047     addl(cnt1, stride);
7048     bind(CONT_SCAN_SUBSTR);
7049     if (ae == StrIntrinsicNode::UL) {
7050       pmovzxbw(vec, Address(str2, 0));
7051     } else {
7052       movdqu(vec, Address(str2, 0));
7053     }
7054     jmp(SCAN_SUBSTR);
7055 
7056     bind(RET_FOUND_LONG);
7057     movptr(str1, Address(rsp, wordSize));
7058   } // non constant
7059 
7060   bind(RET_FOUND);
7061   // Compute substr offset
7062   subptr(result, str1);
7063   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7064     shrl(result, 1); // index
7065   }
7066   bind(CLEANUP);
7067   pop(rsp); // restore SP
7068 
7069 } // string_indexof
7070 
7071 void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7072                                          XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
7073   ShortBranchVerifier sbv(this);
7074   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7075 
7076   int stride = 8;
7077 
7078   Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7079         SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7080         RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7081         FOUND_SEQ_CHAR, DONE_LABEL;
7082 
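     // Strategy: with AVX2, broadcast the char and scan 16 chars (32 bytes) per
     // iteration using vpcmpeqw/vptest; drop to 8-char SSE iterations for what
     // remains, and finish with a scalar loop over the last few chars.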
7083   movptr(result, str1);
7084   if (UseAVX >= 2) {
7085     cmpl(cnt1, stride);
7086     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7087     cmpl(cnt1, 2*stride);
7088     jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
7089     movdl(vec1, ch);
7090     vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
7091     vpxor(vec2, vec2);
7092     movl(tmp, cnt1);
7093     andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
7094     andl(cnt1,0x0000000F);  //tail count (in chars)
7095 
7096     bind(SCAN_TO_16_CHAR_LOOP);
7097     vmovdqu(vec3, Address(result, 0));
7098     vpcmpeqw(vec3, vec3, vec1, 1);
7099     vptest(vec2, vec3);
7100     jcc(Assembler::carryClear, FOUND_CHAR);
7101     addptr(result, 32);
7102     subl(tmp, 2*stride);
7103     jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7104     jmp(SCAN_TO_8_CHAR);
7105     bind(SCAN_TO_8_CHAR_INIT);
7106     movdl(vec1, ch);
7107     pshuflw(vec1, vec1, 0x00);
7108     pshufd(vec1, vec1, 0);
7109     pxor(vec2, vec2);
7110   }
7111   bind(SCAN_TO_8_CHAR);
7112   cmpl(cnt1, stride);
7113   if (UseAVX >= 2) {
7114     jcc(Assembler::less, SCAN_TO_CHAR);
7115   } else {
7116     jcc(Assembler::less, SCAN_TO_CHAR_LOOP);
7117     movdl(vec1, ch);
7118     pshuflw(vec1, vec1, 0x00);
7119     pshufd(vec1, vec1, 0);
7120     pxor(vec2, vec2);
7121   }
7122   movl(tmp, cnt1);
7123   andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
7124   andl(cnt1,0x00000007);  //tail count (in chars)
7125 
7126   bind(SCAN_TO_8_CHAR_LOOP);
7127   movdqu(vec3, Address(result, 0));
7128   pcmpeqw(vec3, vec1);
7129   ptest(vec2, vec3);
7130   jcc(Assembler::carryClear, FOUND_CHAR);
7131   addptr(result, 16);
7132   subl(tmp, stride);
7133   jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
7134   bind(SCAN_TO_CHAR);
7135   testl(cnt1, cnt1);
7136   jcc(Assembler::zero, RET_NOT_FOUND);
7137   bind(SCAN_TO_CHAR_LOOP);
7138   load_unsigned_short(tmp, Address(result, 0));
7139   cmpl(ch, tmp);
7140   jccb(Assembler::equal, FOUND_SEQ_CHAR);
7141   addptr(result, 2);
7142   subl(cnt1, 1);
7143   jccb(Assembler::zero, RET_NOT_FOUND);
7144   jmp(SCAN_TO_CHAR_LOOP);
7145 
7146   bind(RET_NOT_FOUND);
7147   movl(result, -1);
7148   jmpb(DONE_LABEL);
7149 
7150   bind(FOUND_CHAR);
7151   if (UseAVX >= 2) {
7152     vpmovmskb(tmp, vec3);
7153   } else {
7154     pmovmskb(tmp, vec3);
7155   }
7156   bsfl(ch, tmp);
7157   addl(result, ch);
7158 
7159   bind(FOUND_SEQ_CHAR);
7160   subptr(result, str1);
7161   shrl(result, 1);
7162 
7163   bind(DONE_LABEL);
7164 } // string_indexof_char
7165 
7166 // helper function for string_compare
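     // Loads one zero-extended element from each string according to the
     // encoding: LL reads a byte from both, UU reads a char from both, and the
     // mixed encodings read a byte from str1 and a char from str2.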
7167 void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
7168                                         Address::ScaleFactor scale, Address::ScaleFactor scale1,
7169                                         Address::ScaleFactor scale2, Register index, int ae) {
7170   if (ae == StrIntrinsicNode::LL) {
7171     load_unsigned_byte(elem1, Address(str1, index, scale, 0));
7172     load_unsigned_byte(elem2, Address(str2, index, scale, 0));
7173   } else if (ae == StrIntrinsicNode::UU) {
7174     load_unsigned_short(elem1, Address(str1, index, scale, 0));
7175     load_unsigned_short(elem2, Address(str2, index, scale, 0));
7176   } else {
7177     load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
7178     load_unsigned_short(elem2, Address(str2, index, scale2, 0));
7179   }
7180 }
7181 
7182 // Compare strings, used for char[] and byte[].
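     // The result is negative, zero, or positive, as for String.compareTo: the
     // difference of the first mismatched pair of elements, or, if one string
     // is a prefix of the other, the difference of the lengths. A rough scalar
     // sketch (encodings and the final UL negation elided):
     //   int compare(str1, cnt1, str2, cnt2) {
     //     int min = Math.min(cnt1, cnt2);
     //     for (int i = 0; i < min; i++) {
     //       if (str1[i] != str2[i]) return str1[i] - str2[i];
     //     }
     //     return cnt1 - cnt2;
     //   }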
7183 void MacroAssembler::string_compare(Register str1, Register str2,
7184                                     Register cnt1, Register cnt2, Register result,
7185                                     XMMRegister vec1, int ae) {
7186   ShortBranchVerifier sbv(this);
7187   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
7188   Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
7189   int stride, stride2, adr_stride, adr_stride1, adr_stride2;
7190   int stride2x2 = 0x40;
7191   Address::ScaleFactor scale = Address::no_scale;
7192   Address::ScaleFactor scale1 = Address::no_scale;
7193   Address::ScaleFactor scale2 = Address::no_scale;
7194 
7195   if (ae != StrIntrinsicNode::LL) {
7196     stride2x2 = 0x20;
7197   }
7198 
7199   if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
7200     shrl(cnt2, 1);
7201   }
7202   // Compute the minimum of the string lengths and the
7203   // difference of the string lengths (stack).
7204   // Use a conditional move to select the minimum into cnt2
7205   movl(result, cnt1);
7206   subl(cnt1, cnt2);
7207   push(cnt1);
7208   cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
7209 
7210   // Is the minimum length zero?
7211   testl(cnt2, cnt2);
7212   jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7213   if (ae == StrIntrinsicNode::LL) {
7214     // Load first bytes
7215     load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
7216     load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
7217   } else if (ae == StrIntrinsicNode::UU) {
7218     // Load first characters
7219     load_unsigned_short(result, Address(str1, 0));
7220     load_unsigned_short(cnt1, Address(str2, 0));
7221   } else {
7222     load_unsigned_byte(result, Address(str1, 0));
7223     load_unsigned_short(cnt1, Address(str2, 0));
7224   }
7225   subl(result, cnt1);
7226   jcc(Assembler::notZero,  POP_LABEL);
7227 
7228   if (ae == StrIntrinsicNode::UU) {
7229     // Divide length by 2 to get number of chars
7230     shrl(cnt2, 1);
7231   }
7232   cmpl(cnt2, 1);
7233   jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7234 
7235   // Check if the strings start at the same location and setup scale and stride
7236   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7237     cmpptr(str1, str2);
7238     jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7239     if (ae == StrIntrinsicNode::LL) {
7240       scale = Address::times_1;
7241       stride = 16;
7242     } else {
7243       scale = Address::times_2;
7244       stride = 8;
7245     }
7246   } else {
7247     scale1 = Address::times_1;
7248     scale2 = Address::times_2;
7249     // scale not used
7250     stride = 8;
7251   }
7252 
7253   if (UseAVX >= 2 && UseSSE42Intrinsics) {
7254     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
7255     Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
7256     Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
7257     Label COMPARE_TAIL_LONG;
7258     Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
7259 
7260     int pcmpmask = 0x19;
7261     if (ae == StrIntrinsicNode::LL) {
7262       pcmpmask &= ~0x01;
7263     }
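         // pcmpestri imm8 0x19 = 0b011001: bits [1:0] = 01 select unsigned word
         // elements, bits [3:2] = 10 the "equal each" aggregation, and
         // bits [5:4] = 01 negative polarity; clearing bit 0 (0x18) selects
         // unsigned byte elements for the LL case.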
7264 
7265     // Setup to compare 16-chars (32-bytes) vectors,
7266     // start from first character again because it has aligned address.
7267     if (ae == StrIntrinsicNode::LL) {
7268       stride2 = 32;
7269     } else {
7270       stride2 = 16;
7271     }
7272     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7273       adr_stride = stride << scale;
7274     } else {
7275       adr_stride1 = 8;  //stride << scale1;
7276       adr_stride2 = 16; //stride << scale2;
7277     }
7278 
7279     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7280     // rax and rdx are used by pcmpestri as element counters
7281     movl(result, cnt2);
7282     andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
7283     jcc(Assembler::zero, COMPARE_TAIL_LONG);
7284 
7285     // fast path : compare first 2 8-char vectors.
7286     bind(COMPARE_16_CHARS);
7287     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7288       movdqu(vec1, Address(str1, 0));
7289     } else {
7290       pmovzxbw(vec1, Address(str1, 0));
7291     }
7292     pcmpestri(vec1, Address(str2, 0), pcmpmask);
7293     jccb(Assembler::below, COMPARE_INDEX_CHAR);
7294 
7295     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7296       movdqu(vec1, Address(str1, adr_stride));
7297       pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
7298     } else {
7299       pmovzxbw(vec1, Address(str1, adr_stride1));
7300       pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
7301     }
7302     jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
7303     addl(cnt1, stride);
7304 
7305     // Compare the characters at index in cnt1
7306     bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
7307     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7308     subl(result, cnt2);
7309     jmp(POP_LABEL);
7310 
7311     // Setup the registers to start vector comparison loop
7312     bind(COMPARE_WIDE_VECTORS);
7313     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7314       lea(str1, Address(str1, result, scale));
7315       lea(str2, Address(str2, result, scale));
7316     } else {
7317       lea(str1, Address(str1, result, scale1));
7318       lea(str2, Address(str2, result, scale2));
7319     }
7320     subl(result, stride2);
7321     subl(cnt2, stride2);
7322     jcc(Assembler::zero, COMPARE_WIDE_TAIL);
7323     negptr(result);
7324 
7325     //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
7326     bind(COMPARE_WIDE_VECTORS_LOOP);
7327 
7328 #ifdef _LP64
7329     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7330       cmpl(cnt2, stride2x2);
7331       jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7332       testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
7333       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // cnt2 is not a multiple of 0x40, so we cannot subtract in 0x40 steps
7334 
7335       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7336       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7337         evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
7338         evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7339       } else {
7340         vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
7341         evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7342       }
7343       kortestql(k7, k7);
7344       jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
7345       addptr(result, stride2x2);  // update since we already compared at this addr
7346       subl(cnt2, stride2x2);      // and sub the size too
7347       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7348 
7349       vpxor(vec1, vec1);
7350       jmpb(COMPARE_WIDE_TAIL);
7351     }//if (VM_Version::supports_avx512vlbw())
7352 #endif // _LP64
7353 
7354 
7355     bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7356     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7357       vmovdqu(vec1, Address(str1, result, scale));
7358       vpxor(vec1, Address(str2, result, scale));
7359     } else {
7360       vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
7361       vpxor(vec1, Address(str2, result, scale2));
7362     }
7363     vptest(vec1, vec1);
7364     jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
7365     addptr(result, stride2);
7366     subl(cnt2, stride2);
7367     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
7368     // clean upper bits of YMM registers
7369     vpxor(vec1, vec1);
7370 
7371     // compare wide vectors tail
7372     bind(COMPARE_WIDE_TAIL);
7373     testptr(result, result);
7374     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7375 
7376     movl(result, stride2);
7377     movl(cnt2, result);
7378     negptr(result);
7379     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7380 
7381     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
7382     bind(VECTOR_NOT_EQUAL);
7383     // clean upper bits of YMM registers
7384     vpxor(vec1, vec1);
7385     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7386       lea(str1, Address(str1, result, scale));
7387       lea(str2, Address(str2, result, scale));
7388     } else {
7389       lea(str1, Address(str1, result, scale1));
7390       lea(str2, Address(str2, result, scale2));
7391     }
7392     jmp(COMPARE_16_CHARS);
7393 
7394     // Compare tail chars, length between 1 and 15 chars
7395     bind(COMPARE_TAIL_LONG);
7396     movl(cnt2, result);
7397     cmpl(cnt2, stride);
7398     jcc(Assembler::less, COMPARE_SMALL_STR);
7399 
7400     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7401       movdqu(vec1, Address(str1, 0));
7402     } else {
7403       pmovzxbw(vec1, Address(str1, 0));
7404     }
7405     pcmpestri(vec1, Address(str2, 0), pcmpmask);
7406     jcc(Assembler::below, COMPARE_INDEX_CHAR);
7407     subptr(cnt2, stride);
7408     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7409     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7410       lea(str1, Address(str1, result, scale));
7411       lea(str2, Address(str2, result, scale));
7412     } else {
7413       lea(str1, Address(str1, result, scale1));
7414       lea(str2, Address(str2, result, scale2));
7415     }
7416     negptr(cnt2);
7417     jmpb(WHILE_HEAD_LABEL);
7418 
7419     bind(COMPARE_SMALL_STR);
7420   } else if (UseSSE42Intrinsics) {
7421     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
7422     int pcmpmask = 0x19;
7423     // Setup to compare 8-char (16-byte) vectors,
7424     // start from first character again because it has aligned address.
7425     movl(result, cnt2);
7426     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
7427     if (ae == StrIntrinsicNode::LL) {
7428       pcmpmask &= ~0x01;
7429     }
7430     jcc(Assembler::zero, COMPARE_TAIL);
7431     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7432       lea(str1, Address(str1, result, scale));
7433       lea(str2, Address(str2, result, scale));
7434     } else {
7435       lea(str1, Address(str1, result, scale1));
7436       lea(str2, Address(str2, result, scale2));
7437     }
7438     negptr(result);
7439 
7440     // pcmpestri
7441     //   inputs:
7442     //     vec1- substring
7443     //     rax - negative string length (elements count)
7444     //     mem - scanned string
7445     //     rdx - string length (elements count)
7446     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
7447     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
7448     //   outputs:
7449     //     rcx - first mismatched element index
7450     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7451 
7452     bind(COMPARE_WIDE_VECTORS);
7453     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7454       movdqu(vec1, Address(str1, result, scale));
7455       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
7456     } else {
7457       pmovzxbw(vec1, Address(str1, result, scale1));
7458       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
7459     }
7460     // After pcmpestri cnt1(rcx) contains mismatched element index
7461 
7462     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
7463     addptr(result, stride);
7464     subptr(cnt2, stride);
7465     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
7466 
7467     // compare wide vectors tail
7468     testptr(result, result);
7469     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7470 
7471     movl(cnt2, stride);
7472     movl(result, stride);
7473     negptr(result);
7474     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7475       movdqu(vec1, Address(str1, result, scale));
7476       pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
7477     } else {
7478       pmovzxbw(vec1, Address(str1, result, scale1));
7479       pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
7480     }
7481     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
7482 
7483     // Mismatched characters in the vectors
7484     bind(VECTOR_NOT_EQUAL);
7485     addptr(cnt1, result);
7486     load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7487     subl(result, cnt2);
7488     jmpb(POP_LABEL);
7489 
7490     bind(COMPARE_TAIL); // limit is zero
7491     movl(cnt2, result);
7492     // Fallthru to tail compare
7493   }
7494   // Shift str2 and str1 to the end of the arrays, negate min
7495   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7496     lea(str1, Address(str1, cnt2, scale));
7497     lea(str2, Address(str2, cnt2, scale));
7498   } else {
7499     lea(str1, Address(str1, cnt2, scale1));
7500     lea(str2, Address(str2, cnt2, scale2));
7501   }
7502   decrementl(cnt2);  // first character was compared already
7503   negptr(cnt2);
7504 
7505   // Compare the rest of the elements
7506   bind(WHILE_HEAD_LABEL);
7507   load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
7508   subl(result, cnt1);
7509   jccb(Assembler::notZero, POP_LABEL);
7510   increment(cnt2);
7511   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
7512 
7513   // Strings are equal up to min length.  Return the length difference.
7514   bind(LENGTH_DIFF_LABEL);
7515   pop(result);
7516   if (ae == StrIntrinsicNode::UU) {
7517     // Divide diff by 2 to get number of chars
7518     sarl(result, 1);
7519   }
7520   jmpb(DONE_LABEL);
7521 
7522 #ifdef _LP64
7523   if (VM_Version::supports_avx512vlbw()) {
7524 
7525     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
7526 
7527     kmovql(cnt1, k7);
7528     notq(cnt1);
7529     bsfq(cnt2, cnt1);
7530     if (ae != StrIntrinsicNode::LL) {
7531       // Divide diff by 2 to get number of chars
7532       sarl(cnt2, 1);
7533     }
7534     addq(result, cnt2);
7535     if (ae == StrIntrinsicNode::LL) {
7536       load_unsigned_byte(cnt1, Address(str2, result));
7537       load_unsigned_byte(result, Address(str1, result));
7538     } else if (ae == StrIntrinsicNode::UU) {
7539       load_unsigned_short(cnt1, Address(str2, result, scale));
7540       load_unsigned_short(result, Address(str1, result, scale));
7541     } else {
7542       load_unsigned_short(cnt1, Address(str2, result, scale2));
7543       load_unsigned_byte(result, Address(str1, result, scale1));
7544     }
7545     subl(result, cnt1);
7546     jmpb(POP_LABEL);
7547   }//if (VM_Version::supports_avx512vlbw())
7548 #endif // _LP64
7549 
7550   // Discard the stored length difference
7551   bind(POP_LABEL);
7552   pop(cnt1);
7553 
7554   // That's it
7555   bind(DONE_LABEL);
7556   if(ae == StrIntrinsicNode::UL) {
7557     negl(result);
7558   }
7559 
7560 }
7561 
7562 // Search for a non-ASCII character (negative byte value) in a byte array;
7563 // return true if one is found and false otherwise.
7564 //   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
7565 //   @HotSpotIntrinsicCandidate
7566 //   private static boolean hasNegatives(byte[] ba, int off, int len) {
7567 //     for (int i = off; i < off + len; i++) {
7568 //       if (ba[i] < 0) {
7569 //         return true;
7570 //       }
7571 //     }
7572 //     return false;
7573 //   }
7574 void MacroAssembler::has_negatives(Register ary1, Register len,
7575   Register result, Register tmp1,
7576   XMMRegister vec1, XMMRegister vec2) {
7577   // rsi: byte array
7578   // rcx: len
7579   // rax: result
7580   ShortBranchVerifier sbv(this);
7581   assert_different_registers(ary1, len, result, tmp1);
7582   assert_different_registers(vec1, vec2);
7583   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
7584 
7585   // len == 0
7586   testl(len, len);
7587   jcc(Assembler::zero, FALSE_LABEL);
7588 
7589   if ((UseAVX > 2) && // AVX512
7590     VM_Version::supports_avx512vlbw() &&
7591     VM_Version::supports_bmi2()) {
7592 
7593     Label test_64_loop, test_tail;
7594     Register tmp3_aliased = len;
7595 
7596     movl(tmp1, len);
7597     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
7598 
7599     andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
7600     andl(len, ~(64 - 1));    // vector count (in chars)
7601     jccb(Assembler::zero, test_tail);
7602 
7603     lea(ary1, Address(ary1, len, Address::times_1));
7604     negptr(len);
7605 
7606     bind(test_64_loop);
7607     // Check whether the next 64 byte elements contain any negative values
7608     evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
7609     kortestql(k2, k2);
7610     jcc(Assembler::notZero, TRUE_LABEL);
7611 
7612     addptr(len, 64);
7613     jccb(Assembler::notZero, test_64_loop);
7614 
7615 
7616     bind(test_tail);
7617     // bail out when there is nothing to be done
7618     testl(tmp1, -1);
7619     jcc(Assembler::zero, FALSE_LABEL);
7620 
7621     // ~(~0 << len) applied up to two times (for 32-bit scenario)
7622 #ifdef _LP64
7623     mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
7624     shlxq(tmp3_aliased, tmp3_aliased, tmp1);
7625     notq(tmp3_aliased);
7626     kmovql(k3, tmp3_aliased);
7627 #else
7628     Label k_init;
7629     jmp(k_init);
7630 
7631     // We cannot read 64 bits from a general purpose register on 32-bit, so
7632     // we move the data required to compose 64 ones into the instruction stream.
7633     // We emit a 64-byte-wide series of elements 0..63 which is later used as
7634     // the compare target against the tail count contained in the tmp1 register.
7635     // The result is a k register with tmp1 consecutive ones, counting from
7636     // the least significant bit.
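         // For example, if tmp1 == 5, broadcasting 5 and comparing it (signed
         // greater-than) against the byte sequence 0,1,...,63 sets exactly the
         // low 5 mask bits, i.e. k3 == 0x1F.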
7637     address tmp = pc();
7638     emit_int64(0x0706050403020100);
7639     emit_int64(0x0F0E0D0C0B0A0908);
7640     emit_int64(0x1716151413121110);
7641     emit_int64(0x1F1E1D1C1B1A1918);
7642     emit_int64(0x2726252423222120);
7643     emit_int64(0x2F2E2D2C2B2A2928);
7644     emit_int64(0x3736353433323130);
7645     emit_int64(0x3F3E3D3C3B3A3938);
7646 
7647     bind(k_init);
7648     lea(len, InternalAddress(tmp));
7649     // create mask to test for negative byte inside a vector
7650     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
7651     evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);
7652 
7653 #endif
7654     evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
7655     ktestq(k2, k3);
7656     jcc(Assembler::notZero, TRUE_LABEL);
7657 
7658     jmp(FALSE_LABEL);
7659   } else {
7660     movl(result, len); // copy
7661 
7662     if (UseAVX == 2 && UseSSE >= 2) {
7663       // With AVX2, use 32-byte vector compare
7664       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7665 
7666       // Compare 32-byte vectors
7667       andl(result, 0x0000001f);  //   tail count (in bytes)
7668       andl(len, 0xffffffe0);   // vector count (in bytes)
7669       jccb(Assembler::zero, COMPARE_TAIL);
7670 
7671       lea(ary1, Address(ary1, len, Address::times_1));
7672       negptr(len);
7673 
7674       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
7675       movdl(vec2, tmp1);
7676       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
7677 
7678       bind(COMPARE_WIDE_VECTORS);
7679       vmovdqu(vec1, Address(ary1, len, Address::times_1));
7680       vptest(vec1, vec2);
7681       jccb(Assembler::notZero, TRUE_LABEL);
7682       addptr(len, 32);
7683       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7684 
7685       testl(result, result);
7686       jccb(Assembler::zero, FALSE_LABEL);
7687 
7688       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7689       vptest(vec1, vec2);
7690       jccb(Assembler::notZero, TRUE_LABEL);
7691       jmpb(FALSE_LABEL);
7692 
7693       bind(COMPARE_TAIL); // len is zero
7694       movl(len, result);
7695       // Fallthru to tail compare
7696     } else if (UseSSE42Intrinsics) {
7697       // With SSE4.2, use double quad vector compare
7698       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7699 
7700       // Compare 16-byte vectors
7701       andl(result, 0x0000000f);  //   tail count (in bytes)
7702       andl(len, 0xfffffff0);   // vector count (in bytes)
7703       jcc(Assembler::zero, COMPARE_TAIL);
7704 
7705       lea(ary1, Address(ary1, len, Address::times_1));
7706       negptr(len);
7707 
7708       movl(tmp1, 0x80808080);
7709       movdl(vec2, tmp1);
7710       pshufd(vec2, vec2, 0);
7711 
7712       bind(COMPARE_WIDE_VECTORS);
7713       movdqu(vec1, Address(ary1, len, Address::times_1));
7714       ptest(vec1, vec2);
7715       jcc(Assembler::notZero, TRUE_LABEL);
7716       addptr(len, 16);
7717       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7718 
7719       testl(result, result);
7720       jcc(Assembler::zero, FALSE_LABEL);
7721 
7722       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7723       ptest(vec1, vec2);
7724       jccb(Assembler::notZero, TRUE_LABEL);
7725       jmpb(FALSE_LABEL);
7726 
7727       bind(COMPARE_TAIL); // len is zero
7728       movl(len, result);
7729       // Fallthru to tail compare
7730     }
7731   }
7732   // Compare 4-byte vectors
7733   andl(len, 0xfffffffc); // vector count (in bytes)
7734   jccb(Assembler::zero, COMPARE_CHAR);
7735 
7736   lea(ary1, Address(ary1, len, Address::times_1));
7737   negptr(len);
7738 
7739   bind(COMPARE_VECTORS);
7740   movl(tmp1, Address(ary1, len, Address::times_1));
7741   andl(tmp1, 0x80808080);
7742   jccb(Assembler::notZero, TRUE_LABEL);
7743   addptr(len, 4);
7744   jcc(Assembler::notZero, COMPARE_VECTORS);
7745 
7746   // Compare trailing char (final 2 bytes), if any
7747   bind(COMPARE_CHAR);
7748   testl(result, 0x2);   // tail  char
7749   jccb(Assembler::zero, COMPARE_BYTE);
7750   load_unsigned_short(tmp1, Address(ary1, 0));
7751   andl(tmp1, 0x00008080);
7752   jccb(Assembler::notZero, TRUE_LABEL);
7753   subptr(result, 2);
7754   lea(ary1, Address(ary1, 2));
7755 
7756   bind(COMPARE_BYTE);
7757   testl(result, 0x1);   // tail  byte
7758   jccb(Assembler::zero, FALSE_LABEL);
7759   load_unsigned_byte(tmp1, Address(ary1, 0));
7760   andl(tmp1, 0x00000080);
7761   jccb(Assembler::notEqual, TRUE_LABEL);
7762   jmpb(FALSE_LABEL);
7763 
7764   bind(TRUE_LABEL);
7765   movl(result, 1);   // return true
7766   jmpb(DONE);
7767 
7768   bind(FALSE_LABEL);
7769   xorl(result, result); // return false
7770 
7771   // That's it
7772   bind(DONE);
7773   if (UseAVX >= 2 && UseSSE >= 2) {
7774     // clean upper bits of YMM registers
7775     vpxor(vec1, vec1);
7776     vpxor(vec2, vec2);
7777   }
7778 }
7779 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
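     // Sets 'result' to 1 if the arrays (or ranges) are equal and to 0
     // otherwise. A rough scalar sketch (element width elided):
     //   boolean equals(ary1, ary2, limit) {
     //     for (int i = 0; i < limit; i++) {
     //       if (ary1[i] != ary2[i]) return false;
     //     }
     //     return true;
     //   }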
7780 void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
7781                                    Register limit, Register result, Register chr,
7782                                    XMMRegister vec1, XMMRegister vec2, bool is_char) {
7783   ShortBranchVerifier sbv(this);
7784   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
7785 
7786   int length_offset  = arrayOopDesc::length_offset_in_bytes();
7787   int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
7788 
7789   if (is_array_equ) {
7790     // Check the input args
7791     cmpoop(ary1, ary2);
7792     jcc(Assembler::equal, TRUE_LABEL);
7793 
7794     // Need additional checks for arrays_equals.
7795     testptr(ary1, ary1);
7796     jcc(Assembler::zero, FALSE_LABEL);
7797     testptr(ary2, ary2);
7798     jcc(Assembler::zero, FALSE_LABEL);
7799 
7800     // Check the lengths
7801     movl(limit, Address(ary1, length_offset));
7802     cmpl(limit, Address(ary2, length_offset));
7803     jcc(Assembler::notEqual, FALSE_LABEL);
7804   }
7805 
7806   // count == 0
7807   testl(limit, limit);
7808   jcc(Assembler::zero, TRUE_LABEL);
7809 
7810   if (is_array_equ) {
7811     // Load array address
7812     lea(ary1, Address(ary1, base_offset));
7813     lea(ary2, Address(ary2, base_offset));
7814   }
7815 
7816   if (is_array_equ && is_char) {
7817     // arrays_equals when used for char[].
7818     shll(limit, 1);      // convert char count to byte count (still != 0)
7819   }
7820   movl(result, limit); // copy
7821 
7822   if (UseAVX >= 2) {
7823     // With AVX2, use 32-byte vector compare
7824     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7825 
7826     // Compare 32-byte vectors
7827     andl(result, 0x0000001f);  //   tail count (in bytes)
7828     andl(limit, 0xffffffe0);   // vector count (in bytes)
7829     jcc(Assembler::zero, COMPARE_TAIL);
7830 
7831     lea(ary1, Address(ary1, limit, Address::times_1));
7832     lea(ary2, Address(ary2, limit, Address::times_1));
7833     negptr(limit);
7834 
7835     bind(COMPARE_WIDE_VECTORS);
7836 
7837 #ifdef _LP64
7838     if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7839       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
7840 
7841       cmpl(limit, -64);
7842       jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7843 
7844       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7845 
7846       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
7847       evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
7848       kortestql(k7, k7);
7849       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7850       addptr(limit, 64);  // update since we already compared at this addr
7851       cmpl(limit, -64);
7852       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7853 
7854       // At this point we may still need to compare -limit+result bytes.
7855       // We could execute the next two instructions and just continue via the non-wide path:
7856       //  cmpl(limit, 0);
7857       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
7858       // But since we stopped at the points ary{1,2}+limit which are
7859       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
7860       // (|limit| <= 32 and result < 32),
7861       // we may just compare the last 64 bytes.
7862       //
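           // Example with 100-byte arrays: result = 100 & 31 = 4 and limit = 96,
           // negated to -96; one 64-byte compare at offset 0 leaves limit = -32,
           // and the final compare below then reads bytes 36..99, i.e. exactly
           // the last 64 bytes.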
7863       addptr(result, -64);   // it is safe, because we just came from this area
7864       evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
7865       evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
7866       kortestql(k7, k7);
7867       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
7868 
7869       jmp(TRUE_LABEL);
7870 
7871       bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7872 
7873     }//if (VM_Version::supports_avx512vlbw())
7874 #endif //_LP64
7875 
7876     vmovdqu(vec1, Address(ary1, limit, Address::times_1));
7877     vmovdqu(vec2, Address(ary2, limit, Address::times_1));
7878     vpxor(vec1, vec2);
7879 
7880     vptest(vec1, vec1);
7881     jcc(Assembler::notZero, FALSE_LABEL);
7882     addptr(limit, 32);
7883     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7884 
7885     testl(result, result);
7886     jcc(Assembler::zero, TRUE_LABEL);
7887 
7888     vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
7889     vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
7890     vpxor(vec1, vec2);
7891 
7892     vptest(vec1, vec1);
7893     jccb(Assembler::notZero, FALSE_LABEL);
7894     jmpb(TRUE_LABEL);
7895 
7896     bind(COMPARE_TAIL); // limit is zero
7897     movl(limit, result);
7898     // Fallthru to tail compare
7899   } else if (UseSSE42Intrinsics) {
7900     // With SSE4.2, use double quad vector compare
7901     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
7902 
7903     // Compare 16-byte vectors
7904     andl(result, 0x0000000f);  //   tail count (in bytes)
7905     andl(limit, 0xfffffff0);   // vector count (in bytes)
7906     jcc(Assembler::zero, COMPARE_TAIL);
7907 
7908     lea(ary1, Address(ary1, limit, Address::times_1));
7909     lea(ary2, Address(ary2, limit, Address::times_1));
7910     negptr(limit);
7911 
7912     bind(COMPARE_WIDE_VECTORS);
7913     movdqu(vec1, Address(ary1, limit, Address::times_1));
7914     movdqu(vec2, Address(ary2, limit, Address::times_1));
7915     pxor(vec1, vec2);
7916 
7917     ptest(vec1, vec1);
7918     jcc(Assembler::notZero, FALSE_LABEL);
7919     addptr(limit, 16);
7920     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
7921 
7922     testl(result, result);
7923     jcc(Assembler::zero, TRUE_LABEL);
7924 
7925     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
7926     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
7927     pxor(vec1, vec2);
7928 
7929     ptest(vec1, vec1);
7930     jccb(Assembler::notZero, FALSE_LABEL);
7931     jmpb(TRUE_LABEL);
7932 
7933     bind(COMPARE_TAIL); // limit is zero
7934     movl(limit, result);
7935     // Fallthru to tail compare
7936   }
7937 
7938   // Compare 4-byte vectors
7939   andl(limit, 0xfffffffc); // vector count (in bytes)
7940   jccb(Assembler::zero, COMPARE_CHAR);
7941 
7942   lea(ary1, Address(ary1, limit, Address::times_1));
7943   lea(ary2, Address(ary2, limit, Address::times_1));
7944   negptr(limit);
7945 
7946   bind(COMPARE_VECTORS);
7947   movl(chr, Address(ary1, limit, Address::times_1));
7948   cmpl(chr, Address(ary2, limit, Address::times_1));
7949   jccb(Assembler::notEqual, FALSE_LABEL);
7950   addptr(limit, 4);
7951   jcc(Assembler::notZero, COMPARE_VECTORS);
7952 
7953   // Compare trailing char (final 2 bytes), if any
7954   bind(COMPARE_CHAR);
7955   testl(result, 0x2);   // tail  char
7956   jccb(Assembler::zero, COMPARE_BYTE);
7957   load_unsigned_short(chr, Address(ary1, 0));
7958   load_unsigned_short(limit, Address(ary2, 0));
7959   cmpl(chr, limit);
7960   jccb(Assembler::notEqual, FALSE_LABEL);
7961 
7962   if (is_array_equ && is_char) {
7963     bind(COMPARE_BYTE);
7964   } else {
7965     lea(ary1, Address(ary1, 2));
7966     lea(ary2, Address(ary2, 2));
7967 
7968     bind(COMPARE_BYTE);
7969     testl(result, 0x1);   // tail  byte
7970     jccb(Assembler::zero, TRUE_LABEL);
7971     load_unsigned_byte(chr, Address(ary1, 0));
7972     load_unsigned_byte(limit, Address(ary2, 0));
7973     cmpl(chr, limit);
7974     jccb(Assembler::notEqual, FALSE_LABEL);
7975   }
7976   bind(TRUE_LABEL);
7977   movl(result, 1);   // return true
7978   jmpb(DONE);
7979 
7980   bind(FALSE_LABEL);
7981   xorl(result, result); // return false
7982 
7983   // That's it
7984   bind(DONE);
7985   if (UseAVX >= 2) {
7986     // clean upper bits of YMM registers
7987     vpxor(vec1, vec1);
7988     vpxor(vec2, vec2);
7989   }
7990 }
7991 
7992 #endif
7993 
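     // Fills 'count' elements of type 't' at address 'to' with 'value';
     // conceptually the same as the scalar loop (sketch only):
     //   for (int i = 0; i < count; i++) {
     //     to[i] = value;
     //   }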
7994 void MacroAssembler::generate_fill(BasicType t, bool aligned,
7995                                    Register to, Register value, Register count,
7996                                    Register rtmp, XMMRegister xtmp) {
7997   ShortBranchVerifier sbv(this);
7998   assert_different_registers(to, value, count, rtmp);
7999   Label L_exit;
8000   Label L_fill_2_bytes, L_fill_4_bytes;
8001 
8002   int shift = -1;
8003   switch (t) {
8004     case T_BYTE:
8005       shift = 2;
8006       break;
8007     case T_SHORT:
8008       shift = 1;
8009       break;
8010     case T_INT:
8011       shift = 0;
8012       break;
8013     default: ShouldNotReachHere();
8014   }
8015 
8016   if (t == T_BYTE) {
8017     andl(value, 0xff);
8018     movl(rtmp, value);
8019     shll(rtmp, 8);
8020     orl(value, rtmp);
8021   }
8022   if (t == T_SHORT) {
8023     andl(value, 0xffff);
8024   }
8025   if (t == T_BYTE || t == T_SHORT) {
8026     movl(rtmp, value);
8027     shll(rtmp, 16);
8028     orl(value, rtmp);
8029   }
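       // At this point 'value' holds the fill pattern replicated to 32 bits,
       // e.g. a T_BYTE value 0xAB becomes 0xABABABAB and a T_SHORT value
       // 0x1234 becomes 0x12341234.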
8030 
8031   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
8032   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
8033   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
8034     Label L_skip_align2;
8035     // align source address at 4 bytes address boundary
8036     if (t == T_BYTE) {
8037       Label L_skip_align1;
8038       // One byte misalignment happens only for byte arrays
8039       testptr(to, 1);
8040       jccb(Assembler::zero, L_skip_align1);
8041       movb(Address(to, 0), value);
8042       increment(to);
8043       decrement(count);
8044       BIND(L_skip_align1);
8045     }
8046     // Two-byte misalignment happens only for byte and short (char) arrays
8047     testptr(to, 2);
8048     jccb(Assembler::zero, L_skip_align2);
8049     movw(Address(to, 0), value);
8050     addptr(to, 2);
8051     subl(count, 1<<(shift-1));
8052     BIND(L_skip_align2);
8053   }
8054   if (UseSSE < 2) {
8055     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8056     // Fill 32-byte chunks
8057     subl(count, 8 << shift);
8058     jcc(Assembler::less, L_check_fill_8_bytes);
8059     align(16);
8060 
8061     BIND(L_fill_32_bytes_loop);
8062 
8063     for (int i = 0; i < 32; i += 4) {
8064       movl(Address(to, i), value);
8065     }
8066 
8067     addptr(to, 32);
8068     subl(count, 8 << shift);
8069     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8070     BIND(L_check_fill_8_bytes);
8071     addl(count, 8 << shift);
8072     jccb(Assembler::zero, L_exit);
8073     jmpb(L_fill_8_bytes);
8074 
8075     //
8076     // length is too short, just fill qwords
8077     //
8078     BIND(L_fill_8_bytes_loop);
8079     movl(Address(to, 0), value);
8080     movl(Address(to, 4), value);
8081     addptr(to, 8);
8082     BIND(L_fill_8_bytes);
8083     subl(count, 1 << (shift + 1));
8084     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8085     // fall through to fill 4 bytes
8086   } else {
8087     Label L_fill_32_bytes;
8088     if (!UseUnalignedLoadStores) {
8089       // align to 8 bytes, we know we are 4 byte aligned to start
8090       testptr(to, 4);
8091       jccb(Assembler::zero, L_fill_32_bytes);
8092       movl(Address(to, 0), value);
8093       addptr(to, 4);
8094       subl(count, 1<<shift);
8095     }
8096     BIND(L_fill_32_bytes);
8097     {
8098       assert( UseSSE >= 2, "supported cpu only" );
8099       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8100       movdl(xtmp, value);
8101       if (UseAVX > 2 && UseUnalignedLoadStores) {
8102         // Fill 64-byte chunks
8103         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8104         vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8105 
8106         subl(count, 16 << shift);
8107         jcc(Assembler::less, L_check_fill_32_bytes);
8108         align(16);
8109 
8110         BIND(L_fill_64_bytes_loop);
8111         evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8112         addptr(to, 64);
8113         subl(count, 16 << shift);
8114         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8115 
8116         BIND(L_check_fill_32_bytes);
8117         addl(count, 8 << shift);
8118         jccb(Assembler::less, L_check_fill_8_bytes);
8119         vmovdqu(Address(to, 0), xtmp);
8120         addptr(to, 32);
8121         subl(count, 8 << shift);
8122 
8123         BIND(L_check_fill_8_bytes);
8124       } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8125         // Fill 64-byte chunks
8126         Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8127         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
8128 
8129         subl(count, 16 << shift);
8130         jcc(Assembler::less, L_check_fill_32_bytes);
8131         align(16);
8132 
8133         BIND(L_fill_64_bytes_loop);
8134         vmovdqu(Address(to, 0), xtmp);
8135         vmovdqu(Address(to, 32), xtmp);
8136         addptr(to, 64);
8137         subl(count, 16 << shift);
8138         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8139 
8140         BIND(L_check_fill_32_bytes);
8141         addl(count, 8 << shift);
8142         jccb(Assembler::less, L_check_fill_8_bytes);
8143         vmovdqu(Address(to, 0), xtmp);
8144         addptr(to, 32);
8145         subl(count, 8 << shift);
8146 
8147         BIND(L_check_fill_8_bytes);
8148         // re-materialize the 32-bit fill pattern in xtmp for the tail stores below
8149         movdl(xtmp, value);
8150         pshufd(xtmp, xtmp, 0);
8151       } else {
8152         // Fill 32-byte chunks
8153         pshufd(xtmp, xtmp, 0);
8154 
8155         subl(count, 8 << shift);
8156         jcc(Assembler::less, L_check_fill_8_bytes);
8157         align(16);
8158 
8159         BIND(L_fill_32_bytes_loop);
8160 
8161         if (UseUnalignedLoadStores) {
8162           movdqu(Address(to, 0), xtmp);
8163           movdqu(Address(to, 16), xtmp);
8164         } else {
8165           movq(Address(to, 0), xtmp);
8166           movq(Address(to, 8), xtmp);
8167           movq(Address(to, 16), xtmp);
8168           movq(Address(to, 24), xtmp);
8169         }
8170 
8171         addptr(to, 32);
8172         subl(count, 8 << shift);
8173         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8174 
8175         BIND(L_check_fill_8_bytes);
8176       }
8177       addl(count, 8 << shift);
8178       jccb(Assembler::zero, L_exit);
8179       jmpb(L_fill_8_bytes);
8180 
8181       //
8182       // length is too short, just fill qwords
8183       //
8184       BIND(L_fill_8_bytes_loop);
8185       movq(Address(to, 0), xtmp);
8186       addptr(to, 8);
8187       BIND(L_fill_8_bytes);
8188       subl(count, 1 << (shift + 1));
8189       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8190     }
8191   }
8192   // fill trailing 4 bytes
8193   BIND(L_fill_4_bytes);
8194   testl(count, 1<<shift);
8195   jccb(Assembler::zero, L_fill_2_bytes);
8196   movl(Address(to, 0), value);
8197   if (t == T_BYTE || t == T_SHORT) {
8198     Label L_fill_byte;
8199     addptr(to, 4);
8200     BIND(L_fill_2_bytes);
8201     // fill trailing 2 bytes
8202     testl(count, 1<<(shift-1));
8203     jccb(Assembler::zero, L_fill_byte);
8204     movw(Address(to, 0), value);
8205     if (t == T_BYTE) {
8206       addptr(to, 2);
8207       BIND(L_fill_byte);
8208       // fill trailing byte
8209       testl(count, 1);
8210       jccb(Assembler::zero, L_exit);
8211       movb(Address(to, 0), value);
8212     } else {
8213       BIND(L_fill_byte);
8214     }
8215   } else {
8216     BIND(L_fill_2_bytes);
8217   }
8218   BIND(L_exit);
8219 }
8220 
8221 // encode char[] to byte[] in ISO_8859_1
8222 //   @HotSpotIntrinsicCandidate
8223 //   private static int implEncodeISOArray(byte[] sa, int sp,
8224 //                                         byte[] da, int dp, int len) {
8225 //     int i = 0;
8226 //     for (; i < len; i++) {
8227 //       char c = StringUTF16.getChar(sa, sp++);
8228 //       if (c > '\u00FF')
8229 //         break;
8230 //       da[dp++] = (byte)c;
8231 //     }
8232 //     return i;
8233 //   }
8234 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8235   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8236   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8237   Register tmp5, Register result) {
8238 
8239   // rsi: src
8240   // rdi: dst
8241   // rdx: len
8242   // rcx: tmp5
8243   // rax: result
8244   ShortBranchVerifier sbv(this);
8245   assert_different_registers(src, dst, len, tmp5, result);
8246   Label L_done, L_copy_1_char, L_copy_1_char_exit;
8247 
8248   // set result
8249   xorl(result, result);
8250   // check for zero length
8251   testl(len, len);
8252   jcc(Assembler::zero, L_done);
8253 
8254   movl(result, len);
8255 
8256   // Setup pointers
8257   lea(src, Address(src, len, Address::times_2)); // char[]
8258   lea(dst, Address(dst, len, Address::times_1)); // byte[]
8259   negptr(len);
8260 
8261   if (UseSSE42Intrinsics || UseAVX >= 2) {
8262     Label L_copy_8_chars, L_copy_8_chars_exit;
8263     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8264 
8265     if (UseAVX >= 2) {
8266       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8267       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8268       movdl(tmp1Reg, tmp5);
8269       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
8270       jmp(L_chars_32_check);
8271 
8272       bind(L_copy_32_chars);
8273       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8274       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8275       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8276       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
8277       jccb(Assembler::notZero, L_copy_32_chars_exit);
8278       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8279       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8280       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8281 
8282       bind(L_chars_32_check);
8283       addptr(len, 32);
8284       jcc(Assembler::lessEqual, L_copy_32_chars);
8285 
8286       bind(L_copy_32_chars_exit);
8287       subptr(len, 16);
8288       jccb(Assembler::greater, L_copy_16_chars_exit);
8289 
8290     } else if (UseSSE42Intrinsics) {
8291       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8292       movdl(tmp1Reg, tmp5);
8293       pshufd(tmp1Reg, tmp1Reg, 0);
8294       jmpb(L_chars_16_check);
8295     }
8296 
8297     bind(L_copy_16_chars);
8298     if (UseAVX >= 2) {
8299       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
8300       vptest(tmp2Reg, tmp1Reg);
8301       jcc(Assembler::notZero, L_copy_16_chars_exit);
8302       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
8303       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
8304     } else {
8305       if (UseAVX > 0) {
8306         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8307         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8308         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
8309       } else {
8310         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8311         por(tmp2Reg, tmp3Reg);
8312         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8313         por(tmp2Reg, tmp4Reg);
8314       }
8315       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
8316       jccb(Assembler::notZero, L_copy_16_chars_exit);
8317       packuswb(tmp3Reg, tmp4Reg);
8318     }
8319     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
8320 
8321     bind(L_chars_16_check);
8322     addptr(len, 16);
8323     jcc(Assembler::lessEqual, L_copy_16_chars);
8324 
8325     bind(L_copy_16_chars_exit);
8326     if (UseAVX >= 2) {
8327       // clean upper bits of YMM registers
8328       vpxor(tmp2Reg, tmp2Reg);
8329       vpxor(tmp3Reg, tmp3Reg);
8330       vpxor(tmp4Reg, tmp4Reg);
8331       movdl(tmp1Reg, tmp5);
8332       pshufd(tmp1Reg, tmp1Reg, 0);
8333     }
8334     subptr(len, 8);
8335     jccb(Assembler::greater, L_copy_8_chars_exit);
8336 
8337     bind(L_copy_8_chars);
8338     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
8339     ptest(tmp3Reg, tmp1Reg);
8340     jccb(Assembler::notZero, L_copy_8_chars_exit);
8341     packuswb(tmp3Reg, tmp1Reg);
8342     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8343     addptr(len, 8);
8344     jccb(Assembler::lessEqual, L_copy_8_chars);
8345 
8346     bind(L_copy_8_chars_exit);
8347     subptr(len, 8);
8348     jccb(Assembler::zero, L_done);
8349   }
8350 
8351   bind(L_copy_1_char);
8352   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8353   testl(tmp5, 0xff00);      // check if Unicode char
8354   jccb(Assembler::notZero, L_copy_1_char_exit);
8355   movb(Address(dst, len, Address::times_1, 0), tmp5);
8356   addptr(len, 1);
8357   jccb(Assembler::less, L_copy_1_char);
8358 
8359   bind(L_copy_1_char_exit);
8360   addptr(result, len); // len is the negative count of unprocessed elements
8361 
8362   bind(L_done);
8363 }
8364 
8365 #ifdef _LP64
8366 /**
8367  * Helper for multiply_to_len(): dest_hi:dest_lo += src1 + src2 (128-bit add with carry).
8368  */
8369 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
8370   addq(dest_lo, src1);
8371   adcq(dest_hi, 0);
8372   addq(dest_lo, src2);
8373   adcq(dest_hi, 0);
8374 }
8375 
8376 /**
8377  * Multiply 64 bit by 64 bit first loop.
8378  */
8379 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
8380                                            Register y, Register y_idx, Register z,
8381                                            Register carry, Register product,
8382                                            Register idx, Register kdx) {
8383   //
8384   //  jlong carry, x[], y[], z[];
8385   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8386   //    huge_128 product = y[idx] * x[xstart] + carry;
8387   //    z[kdx] = (jlong)product;
8388   //    carry  = (jlong)(product >>> 64);
8389   //  }
8390   //  z[xstart] = carry;
8391   //
8392 
8393   Label L_first_loop, L_first_loop_exit;
8394   Label L_one_x, L_one_y, L_multiply;
8395 
8396   decrementl(xstart);
8397   jcc(Assembler::negative, L_one_x);
8398 
8399   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8400   rorq(x_xstart, 32); // convert big-endian to little-endian
8401 
8402   bind(L_first_loop);
8403   decrementl(idx);
8404   jcc(Assembler::negative, L_first_loop_exit);
8405   decrementl(idx);
8406   jcc(Assembler::negative, L_one_y);
8407   movq(y_idx, Address(y, idx, Address::times_4,  0));
8408   rorq(y_idx, 32); // convert big-endian to little-endian
8409   bind(L_multiply);
8410   movq(product, x_xstart);
8411   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
8412   addq(product, carry);
8413   adcq(rdx, 0);
8414   subl(kdx, 2);
8415   movl(Address(z, kdx, Address::times_4,  4), product);
8416   shrq(product, 32);
8417   movl(Address(z, kdx, Address::times_4,  0), product);
8418   movq(carry, rdx);
8419   jmp(L_first_loop);
8420 
8421   bind(L_one_y);
8422   movl(y_idx, Address(y,  0));
8423   jmp(L_multiply);
8424 
8425   bind(L_one_x);
8426   movl(x_xstart, Address(x,  0));
8427   jmp(L_first_loop);
8428 
8429   bind(L_first_loop_exit);
8430 }
8431 
8432 /**
8433  * Multiply 64 bit by 64 bit and add 128 bit.
8434  */
8435 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
8436                                             Register yz_idx, Register idx,
8437                                             Register carry, Register product, int offset) {
8438   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
8439   //     z[kdx] = (jlong)product;
8440 
8441   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
8442   rorq(yz_idx, 32); // convert big-endian to little-endian
8443   movq(product, x_xstart);
8444   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
8445   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
8446   rorq(yz_idx, 32); // convert big-endian to little-endian
8447 
8448   add2_with_carry(rdx, product, carry, yz_idx);
8449 
8450   movl(Address(z, idx, Address::times_4,  offset+4), product);
8451   shrq(product, 32);
8452   movl(Address(z, idx, Address::times_4,  offset), product);
8453 
8454 }
8455 
8456 /**
8457  * Multiply 128 bit by 128 bit. Unrolled inner loop.
8458  */
8459 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
8460                                              Register yz_idx, Register idx, Register jdx,
8461                                              Register carry, Register product,
8462                                              Register carry2) {
8463   //   jlong carry, x[], y[], z[];
8464   //   int kdx = ystart+1;
8465   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
8466   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
8467   //     z[kdx+idx+1] = (jlong)product;
8468   //     jlong carry2  = (jlong)(product >>> 64);
8469   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
8470   //     z[kdx+idx] = (jlong)product;
8471   //     carry  = (jlong)(product >>> 64);
8472   //   }
8473   //   idx += 2;
8474   //   if (idx > 0) {
8475   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
8476   //     z[kdx+idx] = (jlong)product;
8477   //     carry  = (jlong)(product >>> 64);
8478   //   }
8479   //
8480 
8481   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
8482 
8483   movl(jdx, idx);
8484   andl(jdx, 0xFFFFFFFC);
8485   shrl(jdx, 2);
8486 
8487   bind(L_third_loop);
8488   subl(jdx, 1);
8489   jcc(Assembler::negative, L_third_loop_exit);
8490   subl(idx, 4);
8491 
8492   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
8493   movq(carry2, rdx);
8494 
8495   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
8496   movq(carry, rdx);
8497   jmp(L_third_loop);
8498 
8499   bind (L_third_loop_exit);
8500 
8501   andl (idx, 0x3);
8502   jcc(Assembler::zero, L_post_third_loop_done);
8503 
8504   Label L_check_1;
8505   subl(idx, 2);
8506   jcc(Assembler::negative, L_check_1);
8507 
8508   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
8509   movq(carry, rdx);
8510 
8511   bind (L_check_1);
8512   addl (idx, 0x2);
8513   andl (idx, 0x1);
8514   subl(idx, 1);
8515   jcc(Assembler::negative, L_post_third_loop_done);
8516 
8517   movl(yz_idx, Address(y, idx, Address::times_4,  0));
8518   movq(product, x_xstart);
8519   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
8520   movl(yz_idx, Address(z, idx, Address::times_4,  0));
8521 
8522   add2_with_carry(rdx, product, yz_idx, carry);
8523 
8524   movl(Address(z, idx, Address::times_4,  0), product);
8525   shrq(product, 32);
8526 
8527   shlq(rdx, 32);
8528   orq(product, rdx);
8529   movq(carry, product);
8530 
8531   bind(L_post_third_loop_done);
8532 }
8533 
8534 /**
8535  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
8536  *
8537  */
8538 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
8539                                                   Register carry, Register carry2,
8540                                                   Register idx, Register jdx,
8541                                                   Register yz_idx1, Register yz_idx2,
8542                                                   Register tmp, Register tmp3, Register tmp4) {
8543   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
8544 
8545   //   jlong carry, x[], y[], z[];
8546   //   int kdx = ystart+1;
8547   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
8548   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
8549   //     jlong carry2  = (jlong)(tmp3 >>> 64);
8550   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
8551   //     carry  = (jlong)(tmp4 >>> 64);
8552   //     z[kdx+idx+1] = (jlong)tmp3;
8553   //     z[kdx+idx] = (jlong)tmp4;
8554   //   }
8555   //   idx += 2;
8556   //   if (idx > 0) {
8557   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
8558   //     z[kdx+idx] = (jlong)yz_idx1;
8559   //     carry  = (jlong)(yz_idx1 >>> 64);
8560   //   }
8561   //
8562 
8563   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
8564 
8565   movl(jdx, idx);
8566   andl(jdx, 0xFFFFFFFC);
8567   shrl(jdx, 2);
8568 
8569   bind(L_third_loop);
8570   subl(jdx, 1);
8571   jcc(Assembler::negative, L_third_loop_exit);
8572   subl(idx, 4);
8573 
8574   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
8575   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
8576   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
8577   rorxq(yz_idx2, yz_idx2, 32);
8578 
8579   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
8580   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
8581 
8582   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
8583   rorxq(yz_idx1, yz_idx1, 32);
8584   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
8585   rorxq(yz_idx2, yz_idx2, 32);
8586 
8587   if (VM_Version::supports_adx()) {
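         // ADX allows two independent carry chains to run in parallel: adcx
         // propagates through CF only and adox through OF only, so the two
         // 64-bit accumulations below do not serialize on a single flag.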
8588     adcxq(tmp3, carry);
8589     adoxq(tmp3, yz_idx1);
8590 
8591     adcxq(tmp4, tmp);
8592     adoxq(tmp4, yz_idx2);
8593 
8594     movl(carry, 0); // does not affect flags
8595     adcxq(carry2, carry);
8596     adoxq(carry2, carry);
8597   } else {
8598     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
8599     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
8600   }
8601   movq(carry, carry2);
8602 
8603   movl(Address(z, idx, Address::times_4, 12), tmp3);
8604   shrq(tmp3, 32);
8605   movl(Address(z, idx, Address::times_4,  8), tmp3);
8606 
8607   movl(Address(z, idx, Address::times_4,  4), tmp4);
8608   shrq(tmp4, 32);
8609   movl(Address(z, idx, Address::times_4,  0), tmp4);
8610 
8611   jmp(L_third_loop);
8612 
8613   bind (L_third_loop_exit);
8614 
8615   andl (idx, 0x3);
8616   jcc(Assembler::zero, L_post_third_loop_done);
8617 
8618   Label L_check_1;
8619   subl(idx, 2);
8620   jcc(Assembler::negative, L_check_1);
8621 
8622   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
8623   rorxq(yz_idx1, yz_idx1, 32);
8624   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
8625   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
8626   rorxq(yz_idx2, yz_idx2, 32);
8627 
8628   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
8629 
8630   movl(Address(z, idx, Address::times_4,  4), tmp3);
8631   shrq(tmp3, 32);
8632   movl(Address(z, idx, Address::times_4,  0), tmp3);
8633   movq(carry, tmp4);
8634 
8635   bind (L_check_1);
8636   addl (idx, 0x2);
8637   andl (idx, 0x1);
8638   subl(idx, 1);
8639   jcc(Assembler::negative, L_post_third_loop_done);
8640   movl(tmp4, Address(y, idx, Address::times_4,  0));
8641   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
8642   movl(tmp4, Address(z, idx, Address::times_4,  0));
8643 
8644   add2_with_carry(carry2, tmp3, tmp4, carry);
8645 
8646   movl(Address(z, idx, Address::times_4,  0), tmp3);
8647   shrq(tmp3, 32);
8648 
8649   shlq(carry2, 32);
8650   orq(tmp3, carry2);
8651   movq(carry, tmp3);
8652 
8653   bind(L_post_third_loop_done);
8654 }
8655 
8656 /**
8657  * Code for BigInteger::multiplyToLen() intrinsic.
8658  *
8659  * rdi: x
8660  * rax: xlen
8661  * rsi: y
8662  * rcx: ylen
8663  * r8:  z
8664  * r11: zlen
8665  * r12: tmp1
8666  * r13: tmp2
8667  * r14: tmp3
8668  * r15: tmp4
8669  * rbx: tmp5
8670  *
8671  */
8672 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
8673                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
8674   ShortBranchVerifier sbv(this);
8675   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
8676 
8677   push(tmp1);
8678   push(tmp2);
8679   push(tmp3);
8680   push(tmp4);
8681   push(tmp5);
8682 
8683   push(xlen);
8684   push(zlen);
8685 
8686   const Register idx = tmp1;
8687   const Register kdx = tmp2;
8688   const Register xstart = tmp3;
8689 
8690   const Register y_idx = tmp4;
8691   const Register carry = tmp5;
8692   const Register product  = xlen;
8693   const Register x_xstart = zlen;  // reuse register
8694 
8695   // First Loop.
8696   //
8697   //  final static long LONG_MASK = 0xffffffffL;
8698   //  int xstart = xlen - 1;
8699   //  int ystart = ylen - 1;
8700   //  long carry = 0;
8701   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8702   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
8703   //    z[kdx] = (int)product;
8704   //    carry = product >>> 32;
8705   //  }
8706   //  z[xstart] = (int)carry;
8707   //
8708 
8709   movl(idx, ylen);      // idx = ylen;
8710   movl(kdx, zlen);      // kdx = xlen+ylen;
8711   xorq(carry, carry);   // carry = 0;
8712 
8713   Label L_done;
8714 
8715   movl(xstart, xlen);
8716   decrementl(xstart);
8717   jcc(Assembler::negative, L_done);
8718 
8719   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
8720 
8721   Label L_second_loop;
8722   testl(kdx, kdx);
8723   jcc(Assembler::zero, L_second_loop);
8724 
8725   Label L_carry;
8726   subl(kdx, 1);
8727   jcc(Assembler::zero, L_carry);
8728 
8729   movl(Address(z, kdx, Address::times_4,  0), carry);
8730   shrq(carry, 32);
8731   subl(kdx, 1);
8732 
8733   bind(L_carry);
8734   movl(Address(z, kdx, Address::times_4,  0), carry);
8735 
8736   // Second and third (nested) loops.
8737   //
8738   // for (int i = xstart-1; i >= 0; i--) { // Second loop
8739   //   carry = 0;
8740   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
8741   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
8742   //                    (z[k] & LONG_MASK) + carry;
8743   //     z[k] = (int)product;
8744   //     carry = product >>> 32;
8745   //   }
8746   //   z[i] = (int)carry;
8747   // }
8748   //
8749   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
8750 
8751   const Register jdx = tmp1;
8752 
8753   bind(L_second_loop);
8754   xorl(carry, carry);    // carry = 0;
8755   movl(jdx, ylen);       // j = ystart+1
8756 
8757   subl(xstart, 1);       // i = xstart-1;
8758   jcc(Assembler::negative, L_done);
8759 
8760   push (z);
8761 
8762   Label L_last_x;
8763   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
8764   subl(xstart, 1);       // i = xstart-1;
8765   jcc(Assembler::negative, L_last_x);
8766 
8767   if (UseBMI2Instructions) {
8768     movq(rdx,  Address(x, xstart, Address::times_4,  0));
8769     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
8770   } else {
8771     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8772     rorq(x_xstart, 32);  // convert big-endian to little-endian
8773   }
8774 
8775   Label L_third_loop_prologue;
8776   bind(L_third_loop_prologue);
8777 
8778   push (x);
8779   push (xstart);
8780   push (ylen);
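  // Note: the value pushed here as xstart (the second-loop index) is popped
  // into xlen after the inner loop; that is safe because xlen has been
  // repurposed as 'product' above.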
8781 
8782 
8783   if (UseBMI2Instructions) {
8784     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
8785   } else { // !UseBMI2Instructions
8786     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
8787   }
8788 
8789   pop(ylen);
8790   pop(xlen);
8791   pop(x);
8792   pop(z);
8793 
8794   movl(tmp3, xlen);
8795   addl(tmp3, 1);
8796   movl(Address(z, tmp3, Address::times_4,  0), carry);
8797   subl(tmp3, 1);
8798   jccb(Assembler::negative, L_done);
8799 
8800   shrq(carry, 32);
8801   movl(Address(z, tmp3, Address::times_4,  0), carry);
8802   jmp(L_second_loop);
8803 
8804   // The following infrequent code is moved outside the loops.
8805   bind(L_last_x);
8806   if (UseBMI2Instructions) {
8807     movl(rdx, Address(x,  0));
8808   } else {
8809     movl(x_xstart, Address(x,  0));
8810   }
8811   jmp(L_third_loop_prologue);
8812 
8813   bind(L_done);
8814 
8815   pop(zlen);
8816   pop(xlen);
8817 
8818   pop(tmp5);
8819   pop(tmp4);
8820   pop(tmp3);
8821   pop(tmp2);
8822   pop(tmp1);
8823 }
8824 
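/**
 * Compare two memory regions element by element and report where they differ.
 * A rough C-level sketch of the contract computed below (names are
 * illustrative; log2_array_indxscale is the element-size shift, in rcx):
 *
 *   int mismatch(const jbyte* a, const jbyte* b, int len, int log2scale) {
 *     long lenInBytes = (long)len << log2scale;
 *     for (long i = 0; i < lenInBytes; i++) {
 *       if (a[i] != b[i]) {
 *         return (int)(i >> log2scale);  // element index of first difference
 *       }
 *     }
 *     return -1;                         // regions match over the whole range
 *   }
 */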
8825 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
8826   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
8827   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
8828   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
8829   Label VECTOR8_TAIL, VECTOR4_TAIL;
8830   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
8831   Label SAME_TILL_END, DONE;
8832   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
8833 
8834   //scale is in rcx in both Win64 and Unix
8835   ShortBranchVerifier sbv(this);
8836 
8837   shlq(length);
8838   xorq(result, result);
8839 
8840   if ((UseAVX > 2) &&
8841       VM_Version::supports_avx512vlbw()) {
8842     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
8843 
8844     cmpq(length, 64);
8845     jcc(Assembler::less, VECTOR32_TAIL);
8846     movq(tmp1, length);
8847     andq(tmp1, 0x3F);      // tail count
8848     andq(length, ~(0x3F)); //vector count
8849 
8850     bind(VECTOR64_LOOP);
8851     // AVX512 code to compare 64 byte vectors.
8852     evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
8853     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
8854     kortestql(k7, k7);
8855     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
8856     addq(result, 64);
8857     subq(length, 64);
8858     jccb(Assembler::notZero, VECTOR64_LOOP);
8859 
8860     //bind(VECTOR64_TAIL);
8861     testq(tmp1, tmp1);
8862     jcc(Assembler::zero, SAME_TILL_END);
8863 
8864     //bind(VECTOR64_TAIL);
8865     // AVX512 code to compare up to 63 byte vectors.
8866     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
8867     shlxq(tmp2, tmp2, tmp1);
8868     notq(tmp2);
8869     kmovql(k3, tmp2);
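    // k3 now has the low tmp1 bits set (~(~0 << tmp1)); e.g. a 5-byte tail
    // gives 0x1F, enabling only byte lanes 0..4 in the masked compare below.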
8870 
8871     evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
8872     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
8873 
8874     ktestql(k7, k3);
8875     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
8876 
8877     bind(VECTOR64_NOT_EQUAL);
8878     kmovql(tmp1, k7);
8879     notq(tmp1);
8880     tzcntq(tmp1, tmp1);
8881     addq(result, tmp1);
8882     shrq(result);
8883     jmp(DONE);
8884     bind(VECTOR32_TAIL);
8885   }
8886 
8887   cmpq(length, 8);
8888   jcc(Assembler::equal, VECTOR8_LOOP);
8889   jcc(Assembler::less, VECTOR4_TAIL);
8890 
8891   if (UseAVX >= 2) {
8892     Label VECTOR16_TAIL, VECTOR32_LOOP;
8893 
8894     cmpq(length, 16);
8895     jcc(Assembler::equal, VECTOR16_LOOP);
8896     jcc(Assembler::less, VECTOR8_LOOP);
8897 
8898     cmpq(length, 32);
8899     jccb(Assembler::less, VECTOR16_TAIL);
8900 
8901     subq(length, 32);
8902     bind(VECTOR32_LOOP);
8903     vmovdqu(rymm0, Address(obja, result));
8904     vmovdqu(rymm1, Address(objb, result));
8905     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
8906     vptest(rymm2, rymm2);
8907     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
8908     addq(result, 32);
8909     subq(length, 32);
8910     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
8911     addq(length, 32);
8912     jcc(Assembler::equal, SAME_TILL_END);
8913     //falling through if less than 32 bytes left //close the branch here.
8914 
8915     bind(VECTOR16_TAIL);
8916     cmpq(length, 16);
8917     jccb(Assembler::less, VECTOR8_TAIL);
8918     bind(VECTOR16_LOOP);
8919     movdqu(rymm0, Address(obja, result));
8920     movdqu(rymm1, Address(objb, result));
8921     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
8922     ptest(rymm2, rymm2);
8923     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8924     addq(result, 16);
8925     subq(length, 16);
8926     jcc(Assembler::equal, SAME_TILL_END);
8927     //falling through if less than 16 bytes left
8928   } else {//regular intrinsics
8929 
8930     cmpq(length, 16);
8931     jccb(Assembler::less, VECTOR8_TAIL);
8932 
8933     subq(length, 16);
8934     bind(VECTOR16_LOOP);
8935     movdqu(rymm0, Address(obja, result));
8936     movdqu(rymm1, Address(objb, result));
8937     pxor(rymm0, rymm1);
8938     ptest(rymm0, rymm0);
8939     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
8940     addq(result, 16);
8941     subq(length, 16);
8942     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
8943     addq(length, 16);
8944     jcc(Assembler::equal, SAME_TILL_END);
8945     //falling through if less than 16 bytes left
8946   }
8947 
8948   bind(VECTOR8_TAIL);
8949   cmpq(length, 8);
8950   jccb(Assembler::less, VECTOR4_TAIL);
8951   bind(VECTOR8_LOOP);
8952   movq(tmp1, Address(obja, result));
8953   movq(tmp2, Address(objb, result));
8954   xorq(tmp1, tmp2);
8955   testq(tmp1, tmp1);
8956   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
8957   addq(result, 8);
8958   subq(length, 8);
8959   jcc(Assembler::equal, SAME_TILL_END);
8960   //falling through if less than 8 bytes left
8961 
8962   bind(VECTOR4_TAIL);
8963   cmpq(length, 4);
8964   jccb(Assembler::less, BYTES_TAIL);
8965   bind(VECTOR4_LOOP);
8966   movl(tmp1, Address(obja, result));
8967   xorl(tmp1, Address(objb, result));
8968   testl(tmp1, tmp1);
8969   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
8970   addq(result, 4);
8971   subq(length, 4);
8972   jcc(Assembler::equal, SAME_TILL_END);
8973   //falling through if less than 4 bytes left
8974 
8975   bind(BYTES_TAIL);
8976   bind(BYTES_LOOP);
8977   load_unsigned_byte(tmp1, Address(obja, result));
8978   load_unsigned_byte(tmp2, Address(objb, result));
8979   xorl(tmp1, tmp2);
8980   testl(tmp1, tmp1);
8981   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8982   decq(length);
8983   jcc(Assembler::zero, SAME_TILL_END);
8984   incq(result);
8985   load_unsigned_byte(tmp1, Address(obja, result));
8986   load_unsigned_byte(tmp2, Address(objb, result));
8987   xorl(tmp1, tmp2);
8988   testl(tmp1, tmp1);
8989   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8990   decq(length);
8991   jcc(Assembler::zero, SAME_TILL_END);
8992   incq(result);
8993   load_unsigned_byte(tmp1, Address(obja, result));
8994   load_unsigned_byte(tmp2, Address(objb, result));
8995   xorl(tmp1, tmp2);
8996   testl(tmp1, tmp1);
8997   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
8998   jmp(SAME_TILL_END);
8999 
9000   if (UseAVX >= 2) {
9001     bind(VECTOR32_NOT_EQUAL);
9002     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
9003     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
9004     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
9005     vpmovmskb(tmp1, rymm0);
9006     bsfq(tmp1, tmp1);
9007     addq(result, tmp1);
9008     shrq(result);
9009     jmp(DONE);
9010   }
9011 
9012   bind(VECTOR16_NOT_EQUAL);
9013   if (UseAVX >= 2) {
9014     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
9015     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
9016     pxor(rymm0, rymm2);
9017   } else {
9018     pcmpeqb(rymm2, rymm2);
9019     pxor(rymm0, rymm1);
9020     pcmpeqb(rymm0, rymm1);
9021     pxor(rymm0, rymm2);
9022   }
9023   pmovmskb(tmp1, rymm0);
9024   bsfq(tmp1, tmp1);
9025   addq(result, tmp1);
9026   shrq(result);
9027   jmpb(DONE);
9028 
9029   bind(VECTOR8_NOT_EQUAL);
9030   bind(VECTOR4_NOT_EQUAL);
9031   bsfq(tmp1, tmp1);
9032   shrq(tmp1, 3);
9033   addq(result, tmp1);
9034   bind(BYTES_NOT_EQUAL);
9035   shrq(result);
9036   jmpb(DONE);
9037 
9038   bind(SAME_TILL_END);
9039   mov64(result, -1);
9040 
9041   bind(DONE);
9042 }
9043 
9044 //Helper functions for square_to_len()
9045 
9046 /**
9047  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
9048  * Preserves x and z and modifies rest of the registers.
9049  */
9050 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9051   // Perform square and right shift by 1
9052   // Handle odd xlen case first, then for even xlen do the following
9053   // jlong carry = 0;
9054   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
9055   //     huge_128 product = x[j:j+1] * x[j:j+1];
9056   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
9057   //     z[i+2:i+3] = (jlong)(product >>> 1);
9058   //     carry = (jlong)product;
9059   // }
9060 
9061   xorq(tmp5, tmp5);     // carry
9062   xorq(rdxReg, rdxReg);
9063   xorl(tmp1, tmp1);     // index for x
9064   xorl(tmp4, tmp4);     // index for z
9065 
9066   Label L_first_loop, L_first_loop_exit;
9067 
9068   testl(xlen, 1);
9069   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
9070 
9071   // Square and right shift by 1 the odd element using 32 bit multiply
9072   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
9073   imulq(raxReg, raxReg);
9074   shrq(raxReg, 1);
9075   adcq(tmp5, 0);
9076   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
9077   incrementl(tmp1);
9078   addl(tmp4, 2);
9079 
9080   // Square and  right shift by 1 the rest using 64 bit multiply
9081   bind(L_first_loop);
9082   cmpptr(tmp1, xlen);
9083   jccb(Assembler::equal, L_first_loop_exit);
9084 
9085   // Square
9086   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
9087   rorq(raxReg, 32);    // convert big-endian to little-endian
9088   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
9089 
9090   // Right shift by 1 and save carry
9091   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
9092   rcrq(rdxReg, 1);
9093   rcrq(raxReg, 1);
9094   adcq(tmp5, 0);
9095 
9096   // Store result in z
9097   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
9098   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
9099 
9100   // Update indices for x and z
9101   addl(tmp1, 2);
9102   addl(tmp4, 4);
9103   jmp(L_first_loop);
9104 
9105   bind(L_first_loop_exit);
9106 }
9107 
9108 
9109 /**
9110  * Perform the following multiply add operation using BMI2 instructions
9111  * carry:sum = sum + op1*op2 + carry
9112  * op2 should be in rdx
9113  * op2 is preserved, all other registers are modified
9114  */
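// A rough C-level sketch of the operation (assuming a 128-bit intermediate):
//   unsigned __int128 t = (unsigned __int128)op1 * op2 + sum + carry;
//   sum   = (uint64_t)t;           // low 64 bits stay in sum
//   carry = (uint64_t)(t >> 64);   // high 64 bits become the new carry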
9115 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
9116   // assert op2 is rdx
9117   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
9118   addq(sum, carry);
9119   adcq(tmp2, 0);
9120   addq(sum, op1);
9121   adcq(tmp2, 0);
9122   movq(carry, tmp2);
9123 }
9124 
9125 /**
9126  * Perform the following multiply add operation:
9127  * carry:sum = sum + op1*op2 + carry
9128  * Preserves op1, op2 and modifies rest of registers
9129  */
9130 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
9131   // rdx:rax = op1 * op2
9132   movq(raxReg, op2);
9133   mulq(op1);
9134 
9135   //  rdx:rax = sum + carry + rdx:rax
9136   addq(sum, carry);
9137   adcq(rdxReg, 0);
9138   addq(sum, raxReg);
9139   adcq(rdxReg, 0);
9140 
9141   // carry:sum = rdx:sum
9142   movq(carry, rdxReg);
9143 }
9144 
9145 /**
9146  * Add 64 bit long carry into z[] with carry propagation.
9147  * Preserves z and carry register values and modifies rest of registers.
9148  *
9149  */
9150 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
9151   Label L_fourth_loop, L_fourth_loop_exit;
9152 
9153   movl(tmp1, 1);
9154   subl(zlen, 2);
9155   addq(Address(z, zlen, Address::times_4, 0), carry);
9156 
9157   bind(L_fourth_loop);
9158   jccb(Assembler::carryClear, L_fourth_loop_exit);
9159   subl(zlen, 2);
9160   jccb(Assembler::negative, L_fourth_loop_exit);
9161   addq(Address(z, zlen, Address::times_4, 0), tmp1);
9162   jmp(L_fourth_loop);
9163   bind(L_fourth_loop_exit);
9164 }
9165 
9166 /**
9167  * Shift z[] left by 1 bit.
9168  * Preserves x, len, z and zlen registers and modifies rest of the registers.
9169  *
9170  */
9171 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
9172 
9173   Label L_fifth_loop, L_fifth_loop_exit;
9174 
9175   // Fifth loop
9176   // Perform primitiveLeftShift(z, zlen, 1)
9177 
9178   const Register prev_carry = tmp1;
9179   const Register new_carry = tmp4;
9180   const Register value = tmp2;
9181   const Register zidx = tmp3;
9182 
9183   // int zidx, carry;
9184   // long value;
9185   // carry = 0;
9186   // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
9187   //    (carry:value) = (z[zidx] << 1) | carry;
9188   //    z[zidx] = value;
9189   // }
9190 
9191   movl(zidx, zlen);
9192   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
9193 
9194   bind(L_fifth_loop);
9195   decl(zidx);  // Use decl to preserve carry flag
9196   decl(zidx);
9197   jccb(Assembler::negative, L_fifth_loop_exit);
9198 
9199   if (UseBMI2Instructions) {
9200      movq(value, Address(z, zidx, Address::times_4, 0));
9201      rclq(value, 1);
9202      rorxq(value, value, 32);
9203      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9204   }
9205   else {
9206     // clear new_carry
9207     xorl(new_carry, new_carry);
9208 
9209     // Shift z[i] by 1, or in previous carry and save new carry
9210     movq(value, Address(z, zidx, Address::times_4, 0));
9211     shlq(value, 1);
9212     adcl(new_carry, 0);
9213 
9214     orq(value, prev_carry);
9215     rorq(value, 0x20);
9216     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9217 
9218     // Set previous carry = new carry
9219     movl(prev_carry, new_carry);
9220   }
9221   jmp(L_fifth_loop);
9222 
9223   bind(L_fifth_loop_exit);
9224 }
9225 
9226 
9227 /**
9228  * Code for BigInteger::squareToLen() intrinsic
9229  *
9230  * rdi: x
9231  * rsi: len
9232  * r8:  z
9233  * rcx: zlen
9234  * r12: tmp1
9235  * r13: tmp2
9236  * r14: tmp3
9237  * r15: tmp4
9238  * rbx: tmp5
9239  *
9240  */
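// A rough sketch of the overall algorithm (it mirrors the structure of
// BigInteger.squareToLen; loop names refer to the comments below):
//   1. First loop:               z[] = the squares x[i]^2, each right-shifted by 1
//   2. Second/third/fourth loop: z[] += the off-diagonal products x[i]*x[j], i < j
//   3. Fifth loop:               z[] <<= 1  (doubles both contributions at once)
//   4. Finally:                  z[zlen-1] |= x[len-1] & 1  (low bit lost in step 1)
// This is valid because x^2 = sum_i x[i]^2*B^(2i) + 2*sum_{i<j} x[i]*x[j]*B^(i+j).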
9241 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9242 
9243   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
9244   push(tmp1);
9245   push(tmp2);
9246   push(tmp3);
9247   push(tmp4);
9248   push(tmp5);
9249 
9250   // First loop
9251   // Store the squares, right shifted one bit (i.e., divided by 2).
9252   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
9253 
9254   // Add in off-diagonal sums.
9255   //
9256   // Second, third (nested) and fourth loops.
9257   // zlen +=2;
9258   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
9259   //    carry = 0;
9260   //    long op2 = x[xidx:xidx+1];
9261   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
9262   //       k -= 2;
9263   //       long op1 = x[j:j+1];
9264   //       long sum = z[k:k+1];
9265   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
9266   //       z[k:k+1] = sum;
9267   //    }
9268   //    add_one_64(z, k, carry, tmp_regs);
9269   // }
9270 
9271   const Register carry = tmp5;
9272   const Register sum = tmp3;
9273   const Register op1 = tmp4;
9274   Register op2 = tmp2;
9275 
9276   push(zlen);
9277   push(len);
9278   addl(zlen,2);
9279   bind(L_second_loop);
9280   xorq(carry, carry);
9281   subl(zlen, 4);
9282   subl(len, 2);
9283   push(zlen);
9284   push(len);
9285   cmpl(len, 0);
9286   jccb(Assembler::lessEqual, L_second_loop_exit);
9287 
9288   // Multiply an array by one 64 bit long.
9289   if (UseBMI2Instructions) {
9290     op2 = rdxReg;
9291     movq(op2, Address(x, len, Address::times_4,  0));
9292     rorxq(op2, op2, 32);
9293   }
9294   else {
9295     movq(op2, Address(x, len, Address::times_4,  0));
9296     rorq(op2, 32);
9297   }
9298 
9299   bind(L_third_loop);
9300   decrementl(len);
9301   jccb(Assembler::negative, L_third_loop_exit);
9302   decrementl(len);
9303   jccb(Assembler::negative, L_last_x);
9304 
9305   movq(op1, Address(x, len, Address::times_4,  0));
9306   rorq(op1, 32);
9307 
9308   bind(L_multiply);
9309   subl(zlen, 2);
9310   movq(sum, Address(z, zlen, Address::times_4,  0));
9311 
9312   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
9313   if (UseBMI2Instructions) {
9314     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
9315   }
9316   else {
9317     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9318   }
9319 
9320   movq(Address(z, zlen, Address::times_4, 0), sum);
9321 
9322   jmp(L_third_loop);
9323   bind(L_third_loop_exit);
9324 
9325   // Fourth loop
9326   // Add 64 bit long carry into z with carry propagation.
9327   // Uses offsetted zlen.
9328   add_one_64(z, zlen, carry, tmp1);
9329 
9330   pop(len);
9331   pop(zlen);
9332   jmp(L_second_loop);
9333 
9334   // The following infrequent code is moved outside the loops.
9335   bind(L_last_x);
9336   movl(op1, Address(x, 0));
9337   jmp(L_multiply);
9338 
9339   bind(L_second_loop_exit);
9340   pop(len);
9341   pop(zlen);
9342   pop(len);
9343   pop(zlen);
9344 
9345   // Fifth loop
9346   // Shift z left 1 bit.
9347   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
9348 
9349   // z[zlen-1] |= x[len-1] & 1;
9350   movl(tmp3, Address(x, len, Address::times_4, -4));
9351   andl(tmp3, 1);
9352   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
9353 
9354   pop(tmp5);
9355   pop(tmp4);
9356   pop(tmp3);
9357   pop(tmp2);
9358   pop(tmp1);
9359 }
9360 
9361 /**
9362  * Helper function for mul_add()
9363  * Multiply the in[] by int k and add to out[] starting at offset offs using
9364  * 128 bit by 32 bit multiply and return the carry in tmp5.
9365  * Only the quad-int-aligned portion of in[] is processed by this function.
9366  * k is in rdxReg when UseBMI2Instructions is true, otherwise it is in tmp2.
9367  * This function preserves the out, in and k registers.
9368  * len and offset point to the appropriate index in "in" and "out" respectively.
9369  * tmp5 has the carry.
9370  * other registers are temporary and are modified.
9371  *
9372  */
9373 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
9374   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
9375   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9376 
9377   Label L_first_loop, L_first_loop_exit;
9378 
9379   movl(tmp1, len);
9380   shrl(tmp1, 2);
9381 
9382   bind(L_first_loop);
9383   subl(tmp1, 1);
9384   jccb(Assembler::negative, L_first_loop_exit);
9385 
9386   subl(len, 4);
9387   subl(offset, 4);
9388 
9389   Register op2 = tmp2;
9390   const Register sum = tmp3;
9391   const Register op1 = tmp4;
9392   const Register carry = tmp5;
9393 
9394   if (UseBMI2Instructions) {
9395     op2 = rdxReg;
9396   }
9397 
9398   movq(op1, Address(in, len, Address::times_4,  8));
9399   rorq(op1, 32);
9400   movq(sum, Address(out, offset, Address::times_4,  8));
9401   rorq(sum, 32);
9402   if (UseBMI2Instructions) {
9403     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9404   }
9405   else {
9406     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9407   }
9408   // Store back in big endian from little endian
9409   rorq(sum, 0x20);
9410   movq(Address(out, offset, Address::times_4,  8), sum);
9411 
9412   movq(op1, Address(in, len, Address::times_4,  0));
9413   rorq(op1, 32);
9414   movq(sum, Address(out, offset, Address::times_4,  0));
9415   rorq(sum, 32);
9416   if (UseBMI2Instructions) {
9417     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9418   }
9419   else {
9420     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9421   }
9422   // Store back in big endian from little endian
9423   rorq(sum, 0x20);
9424   movq(Address(out, offset, Address::times_4,  0), sum);
9425 
9426   jmp(L_first_loop);
9427   bind(L_first_loop_exit);
9428 }
9429 
9430 /**
9431  * Code for BigInteger::mulAdd() intrinsic
9432  *
9433  * rdi: out
9434  * rsi: in
9435  * r11: offs (out.length - offset)
9436  * rcx: len
9437  * r8:  k
9438  * r12: tmp1
9439  * r13: tmp2
9440  * r14: tmp3
9441  * r15: tmp4
9442  * rbx: tmp5
9443  * Multiply the in[] by word k and add to out[], return the carry in rax
9444  */
9445 void MacroAssembler::mul_add(Register out, Register in, Register offs,
9446    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
9447    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9448 
9449   Label L_carry, L_last_in, L_done;
9450 
9451 // carry = 0;
9452 // for (int j=len-1; j >= 0; j--) {
9453 //    long product = (in[j] & LONG_MASK) * kLong +
9454 //                   (out[offs] & LONG_MASK) + carry;
9455 //    out[offs--] = (int)product;
9456 //    carry = product >>> 32;
9457 // }
9458 //
9459   push(tmp1);
9460   push(tmp2);
9461   push(tmp3);
9462   push(tmp4);
9463   push(tmp5);
9464 
9465   Register op2 = tmp2;
9466   const Register sum = tmp3;
9467   const Register op1 = tmp4;
9468   const Register carry =  tmp5;
9469 
9470   if (UseBMI2Instructions) {
9471     op2 = rdxReg;
9472     movl(op2, k);
9473   }
9474   else {
9475     movl(op2, k);
9476   }
9477 
9478   xorq(carry, carry);
9479 
9480   //First loop
9481 
9482   //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
9483   //The carry is in tmp5
9484   mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
9485 
9486   //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
9487   decrementl(len);
9488   jccb(Assembler::negative, L_carry);
9489   decrementl(len);
9490   jccb(Assembler::negative, L_last_in);
9491 
9492   movq(op1, Address(in, len, Address::times_4,  0));
9493   rorq(op1, 32);
9494 
9495   subl(offs, 2);
9496   movq(sum, Address(out, offs, Address::times_4,  0));
9497   rorq(sum, 32);
9498 
9499   if (UseBMI2Instructions) {
9500     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9501   }
9502   else {
9503     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9504   }
9505 
9506   // Store back in big endian from little endian
9507   rorq(sum, 0x20);
9508   movq(Address(out, offs, Address::times_4,  0), sum);
9509 
9510   testl(len, len);
9511   jccb(Assembler::zero, L_carry);
9512 
9513   //Multiply the last in[] entry, if any
9514   bind(L_last_in);
9515   movl(op1, Address(in, 0));
9516   movl(sum, Address(out, offs, Address::times_4,  -4));
9517 
9518   movl(raxReg, k);
9519   mull(op1); //tmp4 * eax -> edx:eax
9520   addl(sum, carry);
9521   adcl(rdxReg, 0);
9522   addl(sum, raxReg);
9523   adcl(rdxReg, 0);
9524   movl(carry, rdxReg);
9525 
9526   movl(Address(out, offs, Address::times_4,  -4), sum);
9527 
9528   bind(L_carry);
9529   //return tmp5/carry as carry in rax
9530   movl(rax, carry);
9531 
9532   bind(L_done);
9533   pop(tmp5);
9534   pop(tmp4);
9535   pop(tmp3);
9536   pop(tmp2);
9537   pop(tmp1);
9538 }
9539 #endif
9540 
9541 /**
9542  * Emits code to update CRC-32 with a byte value according to constants in table
9543  *
9544  * @param [in,out] crc   Register containing the crc.
9545  * @param [in] val       Register containing the byte to fold into the CRC.
9546  * @param [in] table     Register containing the table of crc constants.
9547  *
9548  * uint32_t crc;
9549  * val = crc_table[(val ^ crc) & 0xFF];
9550  * crc = val ^ (crc >> 8);
9551  *
9552  */
9553 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
9554   xorl(val, crc);
9555   andl(val, 0xFF);
9556   shrl(crc, 8); // unsigned shift
9557   xorl(crc, Address(table, val, Address::times_4, 0));
9558 }
9559 
9560 /**
9561  * Fold four 128-bit data chunks
9562  */
9563 void MacroAssembler::fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
9564   evpclmulhdq(xtmp, xK, xcrc, Assembler::AVX_512bit); // [123:64]
9565   evpclmulldq(xcrc, xK, xcrc, Assembler::AVX_512bit); // [63:0]
9566   evpxorq(xcrc, xcrc, Address(buf, offset), Assembler::AVX_512bit /* vector_len */);
9567   evpxorq(xcrc, xcrc, xtmp, Assembler::AVX_512bit /* vector_len */);
9568 }
9569 
9570 /**
9571  * Fold 128-bit data chunk
9572  */
9573 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
9574   if (UseAVX > 0) {
9575     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
9576     vpclmulldq(xcrc, xK, xcrc); // [63:0]
9577     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
9578     pxor(xcrc, xtmp);
9579   } else {
9580     movdqa(xtmp, xcrc);
9581     pclmulhdq(xtmp, xK);   // [123:64]
9582     pclmulldq(xcrc, xK);   // [63:0]
9583     pxor(xcrc, xtmp);
9584     movdqu(xtmp, Address(buf, offset));
9585     pxor(xcrc, xtmp);
9586   }
9587 }
9588 
9589 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
9590   if (UseAVX > 0) {
9591     vpclmulhdq(xtmp, xK, xcrc);
9592     vpclmulldq(xcrc, xK, xcrc);
9593     pxor(xcrc, xbuf);
9594     pxor(xcrc, xtmp);
9595   } else {
9596     movdqa(xtmp, xcrc);
9597     pclmulhdq(xtmp, xK);
9598     pclmulldq(xcrc, xK);
9599     pxor(xcrc, xbuf);
9600     pxor(xcrc, xtmp);
9601   }
9602 }
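
// The 128-bit fold helpers above rely on carry-less multiplication: with
// K = (K_hi, K_lo) holding suitable powers of x reduced modulo P(x),
//   fold(A, D) = clmul(A_hi, K_hi) ^ clmul(A_lo, K_lo) ^ D
// keeps the accumulator congruent (mod P(x)) to the data consumed so far.
// (A sketch; the *hdq/*ldq helpers above multiply the high and low 64-bit
// halves respectively.)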
9603 
9604 /**
9605  * 8-bit folds to compute 32-bit CRC
9606  *
9607  * uint64_t xcrc;
9608  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
9609  */
9610 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
9611   movdl(tmp, xcrc);
9612   andl(tmp, 0xFF);
9613   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
9614   psrldq(xcrc, 1); // unsigned shift one byte
9615   pxor(xcrc, xtmp);
9616 }
9617 
9618 /**
9619  * uint32_t crc;
9620  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
9621  */
9622 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
9623   movl(tmp, crc);
9624   andl(tmp, 0xFF);
9625   shrl(crc, 8);
9626   xorl(crc, Address(table, tmp, Address::times_4, 0));
9627 }
9628 
9629 /**
9630  * @param crc   register containing existing CRC (32-bit)
9631  * @param buf   register pointing to input byte buffer (byte*)
9632  * @param len   register containing number of bytes
9633  * @param table register that will contain address of CRC table
9634  * @param tmp   scratch register
9635  */
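// Rough outline of the flow below (each stage falls through to the next):
//   1. process bytes one at a time until buf is 16-byte aligned
//   2. fold 512 bits per iteration across 4 parallel 128-bit streams
//      (or 512-bit vectors when VPCLMULQDQ is available)
//   3. fold the streams down to one 128-bit value, then down to 32 bits
//   4. process any remaining tail bytes with the lookup table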
9636 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
9637   assert_different_registers(crc, buf, len, table, tmp, rax);
9638 
9639   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
9640   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
9641 
9642   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
9643   // context for the registers used, where all instructions below are using 128-bit mode
9644   // On EVEX without VL and BW, these instructions will all be AVX.
9645   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
9646   notl(crc); // ~crc
9647   cmpl(len, 16);
9648   jcc(Assembler::less, L_tail);
9649 
9650   // Align buffer to 16 bytes
9651   movl(tmp, buf);
9652   andl(tmp, 0xF);
9653   jccb(Assembler::zero, L_aligned);
9654   subl(tmp,  16);
9655   addl(len, tmp);
9656 
9657   align(4);
9658   BIND(L_align_loop);
9659   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9660   update_byte_crc32(crc, rax, table);
9661   increment(buf);
9662   incrementl(tmp);
9663   jccb(Assembler::less, L_align_loop);
9664 
9665   BIND(L_aligned);
9666   movl(tmp, len); // save
9667   shrl(len, 4);
9668   jcc(Assembler::zero, L_tail_restore);
9669 
9670   // Fold total 512 bits of polynomial on each iteration
9671   if (VM_Version::supports_vpclmulqdq()) {
9672     Label Parallel_loop, L_No_Parallel;
9673 
9674     cmpl(len, 8);
9675     jccb(Assembler::less, L_No_Parallel);
9676 
9677     movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9678     evmovdquq(xmm1, Address(buf, 0), Assembler::AVX_512bit);
9679     movdl(xmm5, crc);
9680     evpxorq(xmm1, xmm1, xmm5, Assembler::AVX_512bit);
9681     addptr(buf, 64);
9682     subl(len, 7);
9683     evshufi64x2(xmm0, xmm0, xmm0, 0x00, Assembler::AVX_512bit); //propagate the mask from 128 bits to 512 bits
9684 
9685     BIND(Parallel_loop);
9686     fold_128bit_crc32_avx512(xmm1, xmm0, xmm5, buf, 0);
9687     addptr(buf, 64);
9688     subl(len, 4);
9689     jcc(Assembler::greater, Parallel_loop);
9690 
9691     vextracti64x2(xmm2, xmm1, 0x01);
9692     vextracti64x2(xmm3, xmm1, 0x02);
9693     vextracti64x2(xmm4, xmm1, 0x03);
9694     jmp(L_fold_512b);
9695 
9696     BIND(L_No_Parallel);
9697   }
9698   // Fold crc into first bytes of vector
9699   movdqa(xmm1, Address(buf, 0));
9700   movdl(rax, xmm1);
9701   xorl(crc, rax);
9702   if (VM_Version::supports_sse4_1()) {
9703     pinsrd(xmm1, crc, 0);
9704   } else {
9705     pinsrw(xmm1, crc, 0);
9706     shrl(crc, 16);
9707     pinsrw(xmm1, crc, 1);
9708   }
9709   addptr(buf, 16);
9710   subl(len, 4); // len > 0
9711   jcc(Assembler::less, L_fold_tail);
9712 
9713   movdqa(xmm2, Address(buf,  0));
9714   movdqa(xmm3, Address(buf, 16));
9715   movdqa(xmm4, Address(buf, 32));
9716   addptr(buf, 48);
9717   subl(len, 3);
9718   jcc(Assembler::lessEqual, L_fold_512b);
9719 
9720   // Fold total 512 bits of polynomial on each iteration,
9721   // 128 bits per each of 4 parallel streams.
9722   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
9723 
9724   align(32);
9725   BIND(L_fold_512b_loop);
9726   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9727   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
9728   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
9729   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
9730   addptr(buf, 64);
9731   subl(len, 4);
9732   jcc(Assembler::greater, L_fold_512b_loop);
9733 
9734   // Fold 512 bits to 128 bits.
9735   BIND(L_fold_512b);
9736   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9737   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
9738   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
9739   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
9740 
9741   // Fold the rest of 128 bits data chunks
9742   BIND(L_fold_tail);
9743   addl(len, 3);
9744   jccb(Assembler::lessEqual, L_fold_128b);
9745   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
9746 
9747   BIND(L_fold_tail_loop);
9748   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
9749   addptr(buf, 16);
9750   decrementl(len);
9751   jccb(Assembler::greater, L_fold_tail_loop);
9752 
9753   // Fold 128 bits in xmm1 down into 32 bits in crc register.
9754   BIND(L_fold_128b);
9755   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
9756   if (UseAVX > 0) {
9757     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
9758     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
9759     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
9760   } else {
9761     movdqa(xmm2, xmm0);
9762     pclmulqdq(xmm2, xmm1, 0x1);
9763     movdqa(xmm3, xmm0);
9764     pand(xmm3, xmm2);
9765     pclmulqdq(xmm0, xmm3, 0x1);
9766   }
9767   psrldq(xmm1, 8);
9768   psrldq(xmm2, 4);
9769   pxor(xmm0, xmm1);
9770   pxor(xmm0, xmm2);
9771 
9772   // 8 8-bit folds to compute 32-bit CRC.
9773   for (int j = 0; j < 4; j++) {
9774     fold_8bit_crc32(xmm0, table, xmm1, rax);
9775   }
9776   movdl(crc, xmm0); // mov 32 bits to general register
9777   for (int j = 0; j < 4; j++) {
9778     fold_8bit_crc32(crc, table, rax);
9779   }
9780 
9781   BIND(L_tail_restore);
9782   movl(len, tmp); // restore
9783   BIND(L_tail);
9784   andl(len, 0xf);
9785   jccb(Assembler::zero, L_exit);
9786 
9787   // Fold the rest of bytes
9788   align(4);
9789   BIND(L_tail_loop);
9790   movsbl(rax, Address(buf, 0)); // load byte with sign extension
9791   update_byte_crc32(crc, rax, table);
9792   increment(buf);
9793   decrementl(len);
9794   jccb(Assembler::greater, L_tail_loop);
9795 
9796   BIND(L_exit);
9797   notl(crc); // ~crc
9798 }
9799 
9800 #ifdef _LP64
9801 // S. Gueron / Information Processing Letters 112 (2012) 184
9802 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
9803 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
9804 // Output: the 64-bit carry-less product of B * CONST
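//
// A rough C-level sketch of the lookup (assumption: TABLE[n][b] holds the
// precomputed 64-bit carry-less product of byte value b with CONST):
//   uint64_t q = TABLE[n][ B        & 0xFF]
//              ^ TABLE[n][(B >>  8) & 0xFF] <<  8
//              ^ TABLE[n][(B >> 16) & 0xFF] << 16
//              ^ TABLE[n][(B >> 24) & 0xFF] << 24;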
9805 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
9806                                      Register tmp1, Register tmp2, Register tmp3) {
9807   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9808   if (n > 0) {
9809     addq(tmp3, n * 256 * 8);
9810   }
9811   //    Q1 = TABLEExt[n][B & 0xFF];
9812   movl(tmp1, in);
9813   andl(tmp1, 0x000000FF);
9814   shll(tmp1, 3);
9815   addq(tmp1, tmp3);
9816   movq(tmp1, Address(tmp1, 0));
9817 
9818   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9819   movl(tmp2, in);
9820   shrl(tmp2, 8);
9821   andl(tmp2, 0x000000FF);
9822   shll(tmp2, 3);
9823   addq(tmp2, tmp3);
9824   movq(tmp2, Address(tmp2, 0));
9825 
9826   shlq(tmp2, 8);
9827   xorq(tmp1, tmp2);
9828 
9829   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9830   movl(tmp2, in);
9831   shrl(tmp2, 16);
9832   andl(tmp2, 0x000000FF);
9833   shll(tmp2, 3);
9834   addq(tmp2, tmp3);
9835   movq(tmp2, Address(tmp2, 0));
9836 
9837   shlq(tmp2, 16);
9838   xorq(tmp1, tmp2);
9839 
9840   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9841   shrl(in, 24);
9842   andl(in, 0x000000FF);
9843   shll(in, 3);
9844   addq(in, tmp3);
9845   movq(in, Address(in, 0));
9846 
9847   shlq(in, 24);
9848   xorq(in, tmp1);
9849   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9850 }
9851 
9852 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
9853                                       Register in_out,
9854                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
9855                                       XMMRegister w_xtmp2,
9856                                       Register tmp1,
9857                                       Register n_tmp2, Register n_tmp3) {
9858   if (is_pclmulqdq_supported) {
9859     movdl(w_xtmp1, in_out); // modified blindly
9860 
9861     movl(tmp1, const_or_pre_comp_const_index);
9862     movdl(w_xtmp2, tmp1);
9863     pclmulqdq(w_xtmp1, w_xtmp2, 0);
9864 
9865     movdq(in_out, w_xtmp1);
9866   } else {
9867     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
9868   }
9869 }
9870 
9871 // Recombination Alternative 2: No bit-reflections
9872 // T1 = (CRC_A * U1) << 1
9873 // T2 = (CRC_B * U2) << 1
9874 // C1 = T1 >> 32
9875 // C2 = T2 >> 32
9876 // T1 = T1 & 0xFFFFFFFF
9877 // T2 = T2 & 0xFFFFFFFF
9878 // T1 = CRC32(0, T1)
9879 // T2 = CRC32(0, T2)
9880 // C1 = C1 ^ T1
9881 // C2 = C2 ^ T2
9882 // CRC = C1 ^ C2 ^ CRC_C
9883 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
9884                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9885                                      Register tmp1, Register tmp2,
9886                                      Register n_tmp3) {
9887   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9888   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
9889   shlq(in_out, 1);
9890   movl(tmp1, in_out);
9891   shrq(in_out, 32);
9892   xorl(tmp2, tmp2);
9893   crc32(tmp2, tmp1, 4);
9894   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
9895   shlq(in1, 1);
9896   movl(tmp1, in1);
9897   shrq(in1, 32);
9898   xorl(tmp2, tmp2);
9899   crc32(tmp2, tmp1, 4);
9900   xorl(in1, tmp2);
9901   xorl(in_out, in1);
9902   xorl(in_out, in2);
9903 }
9904 
9905 // Set N to predefined value
9906 // Subtract from the length of the buffer
9907 // execute in a loop:
9908 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
9909 // for i = 1 to N do
9910 //  CRC_A = CRC32(CRC_A, A[i])
9911 //  CRC_B = CRC32(CRC_B, B[i])
9912 //  CRC_C = CRC32(CRC_C, C[i])
9913 // end for
9914 // Recombine
9915 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
9916                                        Register in_out1, Register in_out2, Register in_out3,
9917                                        Register tmp1, Register tmp2, Register tmp3,
9918                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
9919                                        Register tmp4, Register tmp5,
9920                                        Register n_tmp6) {
9921   Label L_processPartitions;
9922   Label L_processPartition;
9923   Label L_exit;
9924 
9925   bind(L_processPartitions);
9926   cmpl(in_out1, 3 * size);
9927   jcc(Assembler::less, L_exit);
9928     xorl(tmp1, tmp1);
9929     xorl(tmp2, tmp2);
9930     movq(tmp3, in_out2);
9931     addq(tmp3, size);
9932 
9933     bind(L_processPartition);
9934       crc32(in_out3, Address(in_out2, 0), 8);
9935       crc32(tmp1, Address(in_out2, size), 8);
9936       crc32(tmp2, Address(in_out2, size * 2), 8);
9937       addq(in_out2, 8);
9938       cmpq(in_out2, tmp3);
9939       jcc(Assembler::less, L_processPartition);
9940     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
9941             w_xtmp1, w_xtmp2, w_xtmp3,
9942             tmp4, tmp5,
9943             n_tmp6);
9944     addq(in_out2, 2 * size);
9945     subl(in_out1, 3 * size);
9946     jmp(L_processPartitions);
9947 
9948   bind(L_exit);
9949 }
9950 #else
9951 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
9952                                      Register tmp1, Register tmp2, Register tmp3,
9953                                      XMMRegister xtmp1, XMMRegister xtmp2) {
9954   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
9955   if (n > 0) {
9956     addl(tmp3, n * 256 * 8);
9957   }
9958   //    Q1 = TABLEExt[n][B & 0xFF];
9959   movl(tmp1, in_out);
9960   andl(tmp1, 0x000000FF);
9961   shll(tmp1, 3);
9962   addl(tmp1, tmp3);
9963   movq(xtmp1, Address(tmp1, 0));
9964 
9965   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
9966   movl(tmp2, in_out);
9967   shrl(tmp2, 8);
9968   andl(tmp2, 0x000000FF);
9969   shll(tmp2, 3);
9970   addl(tmp2, tmp3);
9971   movq(xtmp2, Address(tmp2, 0));
9972 
9973   psllq(xtmp2, 8);
9974   pxor(xtmp1, xtmp2);
9975 
9976   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
9977   movl(tmp2, in_out);
9978   shrl(tmp2, 16);
9979   andl(tmp2, 0x000000FF);
9980   shll(tmp2, 3);
9981   addl(tmp2, tmp3);
9982   movq(xtmp2, Address(tmp2, 0));
9983 
9984   psllq(xtmp2, 16);
9985   pxor(xtmp1, xtmp2);
9986 
9987   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
9988   shrl(in_out, 24);
9989   andl(in_out, 0x000000FF);
9990   shll(in_out, 3);
9991   addl(in_out, tmp3);
9992   movq(xtmp2, Address(in_out, 0));
9993 
9994   psllq(xtmp2, 24);
9995   pxor(xtmp1, xtmp2); // Result in CXMM
9996   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
9997 }
9998 
9999 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10000                                       Register in_out,
10001                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10002                                       XMMRegister w_xtmp2,
10003                                       Register tmp1,
10004                                       Register n_tmp2, Register n_tmp3) {
10005   if (is_pclmulqdq_supported) {
10006     movdl(w_xtmp1, in_out);
10007 
10008     movl(tmp1, const_or_pre_comp_const_index);
10009     movdl(w_xtmp2, tmp1);
10010     pclmulqdq(w_xtmp1, w_xtmp2, 0);
10011     // Keep result in XMM since GPR is 32 bit in length
10012   } else {
10013     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
10014   }
10015 }
10016 
10017 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10018                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10019                                      Register tmp1, Register tmp2,
10020                                      Register n_tmp3) {
10021   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10022   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10023 
10024   psllq(w_xtmp1, 1);
10025   movdl(tmp1, w_xtmp1);
10026   psrlq(w_xtmp1, 32);
10027   movdl(in_out, w_xtmp1);
10028 
10029   xorl(tmp2, tmp2);
10030   crc32(tmp2, tmp1, 4);
10031   xorl(in_out, tmp2);
10032 
10033   psllq(w_xtmp2, 1);
10034   movdl(tmp1, w_xtmp2);
10035   psrlq(w_xtmp2, 32);
10036   movdl(in1, w_xtmp2);
10037 
10038   xorl(tmp2, tmp2);
10039   crc32(tmp2, tmp1, 4);
10040   xorl(in1, tmp2);
10041   xorl(in_out, in1);
10042   xorl(in_out, in2);
10043 }
10044 
10045 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10046                                        Register in_out1, Register in_out2, Register in_out3,
10047                                        Register tmp1, Register tmp2, Register tmp3,
10048                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10049                                        Register tmp4, Register tmp5,
10050                                        Register n_tmp6) {
10051   Label L_processPartitions;
10052   Label L_processPartition;
10053   Label L_exit;
10054 
10055   bind(L_processPartitions);
10056   cmpl(in_out1, 3 * size);
10057   jcc(Assembler::less, L_exit);
10058     xorl(tmp1, tmp1);
10059     xorl(tmp2, tmp2);
10060     movl(tmp3, in_out2);
10061     addl(tmp3, size);
10062 
10063     bind(L_processPartition);
10064       crc32(in_out3, Address(in_out2, 0), 4);
10065       crc32(tmp1, Address(in_out2, size), 4);
10066       crc32(tmp2, Address(in_out2, size*2), 4);
10067       crc32(in_out3, Address(in_out2, 0+4), 4);
10068       crc32(tmp1, Address(in_out2, size+4), 4);
10069       crc32(tmp2, Address(in_out2, size*2+4), 4);
10070       addl(in_out2, 8);
10071       cmpl(in_out2, tmp3);
10072       jcc(Assembler::less, L_processPartition);
10073 
10074         push(tmp3);
10075         push(in_out1);
10076         push(in_out2);
10077         tmp4 = tmp3;
10078         tmp5 = in_out1;
10079         n_tmp6 = in_out2;
10080 
10081       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10082             w_xtmp1, w_xtmp2, w_xtmp3,
10083             tmp4, tmp5,
10084             n_tmp6);
10085 
10086         pop(in_out2);
10087         pop(in_out1);
10088         pop(tmp3);
10089 
10090     addl(in_out2, 2 * size);
10091     subl(in_out1, 3 * size);
10092     jmp(L_processPartitions);
10093 
10094   bind(L_exit);
10095 }
10096 #endif //LP64
10097 
10098 #ifdef _LP64
10099 // Algorithm 2: Pipelined usage of the CRC32 instruction.
10100 // Input: A buffer I of L bytes.
10101 // Output: the CRC32C value of the buffer.
10102 // Notations:
10103 // Write L = 24N + r, with N = floor (L/24).
10104 // r = L mod 24 (0 <= r < 24).
10105 // Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
10106 // N quadwords, and R consists of r bytes.
10107 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
10108 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
10109 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
10110 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
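//
// Rough sketch of the flow below (notation as above; S iterates over the
// precomputed chunk sizes used by crc32c_proc_chunk):
//   for each chunk size S in { HIGH, MIDDLE, LOW }:
//     while (L >= 3*S) {
//       compute CRC_A, CRC_B, CRC_C over the next A, B, C in one interleaved loop
//       recombine them into a single running CRC        // crc32c_rec_alt2
//       L -= 3*S
//     }
//   finish the remainder word by word, then byte by byte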
10111 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10112                                           Register tmp1, Register tmp2, Register tmp3,
10113                                           Register tmp4, Register tmp5, Register tmp6,
10114                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10115                                           bool is_pclmulqdq_supported) {
10116   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10117   Label L_wordByWord;
10118   Label L_byteByByteProlog;
10119   Label L_byteByByte;
10120   Label L_exit;
10121 
10122   if (is_pclmulqdq_supported ) {
10123     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10124     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
10125 
10126     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10127     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10128 
10129     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10130     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10131     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
10132   } else {
10133     const_or_pre_comp_const_index[0] = 1;
10134     const_or_pre_comp_const_index[1] = 0;
10135 
10136     const_or_pre_comp_const_index[2] = 3;
10137     const_or_pre_comp_const_index[3] = 2;
10138 
10139     const_or_pre_comp_const_index[4] = 5;
10140     const_or_pre_comp_const_index[5] = 4;
10141    }
10142   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10143                     in2, in1, in_out,
10144                     tmp1, tmp2, tmp3,
10145                     w_xtmp1, w_xtmp2, w_xtmp3,
10146                     tmp4, tmp5,
10147                     tmp6);
10148   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10149                     in2, in1, in_out,
10150                     tmp1, tmp2, tmp3,
10151                     w_xtmp1, w_xtmp2, w_xtmp3,
10152                     tmp4, tmp5,
10153                     tmp6);
10154   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10155                     in2, in1, in_out,
10156                     tmp1, tmp2, tmp3,
10157                     w_xtmp1, w_xtmp2, w_xtmp3,
10158                     tmp4, tmp5,
10159                     tmp6);
10160   movl(tmp1, in2);
10161   andl(tmp1, 0x00000007);
10162   negl(tmp1);
10163   addl(tmp1, in2);
10164   addq(tmp1, in1);
10165 
10166   BIND(L_wordByWord);
10167   cmpq(in1, tmp1);
10168   jcc(Assembler::greaterEqual, L_byteByByteProlog);
10169     crc32(in_out, Address(in1, 0), 4);
10170     addq(in1, 4);
10171     jmp(L_wordByWord);
10172 
10173   BIND(L_byteByByteProlog);
10174   andl(in2, 0x00000007);
10175   movl(tmp2, 1);
10176 
10177   BIND(L_byteByByte);
10178   cmpl(tmp2, in2);
10179   jccb(Assembler::greater, L_exit);
10180     crc32(in_out, Address(in1, 0), 1);
10181     incq(in1);
10182     incl(tmp2);
10183     jmp(L_byteByByte);
10184 
10185   BIND(L_exit);
10186 }
10187 #else
10188 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10189                                           Register tmp1, Register  tmp2, Register tmp3,
10190                                           Register tmp4, Register  tmp5, Register tmp6,
10191                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10192                                           bool is_pclmulqdq_supported) {
10193   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10194   Label L_wordByWord;
10195   Label L_byteByByteProlog;
10196   Label L_byteByByte;
10197   Label L_exit;
10198 
10199   if (is_pclmulqdq_supported) {
10200     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10201     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
10202 
10203     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10204     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10205 
10206     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10207     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10208   } else {
10209     const_or_pre_comp_const_index[0] = 1;
10210     const_or_pre_comp_const_index[1] = 0;
10211 
10212     const_or_pre_comp_const_index[2] = 3;
10213     const_or_pre_comp_const_index[3] = 2;
10214 
10215     const_or_pre_comp_const_index[4] = 5;
10216     const_or_pre_comp_const_index[5] = 4;
10217   }
10218   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10219                     in2, in1, in_out,
10220                     tmp1, tmp2, tmp3,
10221                     w_xtmp1, w_xtmp2, w_xtmp3,
10222                     tmp4, tmp5,
10223                     tmp6);
10224   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10225                     in2, in1, in_out,
10226                     tmp1, tmp2, tmp3,
10227                     w_xtmp1, w_xtmp2, w_xtmp3,
10228                     tmp4, tmp5,
10229                     tmp6);
10230   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10231                     in2, in1, in_out,
10232                     tmp1, tmp2, tmp3,
10233                     w_xtmp1, w_xtmp2, w_xtmp3,
10234                     tmp4, tmp5,
10235                     tmp6);
10236   movl(tmp1, in2);
10237   andl(tmp1, 0x00000007);
10238   negl(tmp1);
10239   addl(tmp1, in2);
10240   addl(tmp1, in1);
10241 
10242   BIND(L_wordByWord);
10243   cmpl(in1, tmp1);
10244   jcc(Assembler::greaterEqual, L_byteByByteProlog);
10245     crc32(in_out, Address(in1,0), 4);
10246     addl(in1, 4);
10247     jmp(L_wordByWord);
10248 
10249   BIND(L_byteByByteProlog);
10250   andl(in2, 0x00000007);
10251   movl(tmp2, 1);
10252 
10253   BIND(L_byteByByte);
10254   cmpl(tmp2, in2);
10255   jccb(Assembler::greater, L_exit);
10256     movb(tmp1, Address(in1, 0));
10257     crc32(in_out, tmp1, 1);
10258     incl(in1);
10259     incl(tmp2);
10260     jmp(L_byteByByte);
10261 
10262   BIND(L_exit);
10263 }
10264 #endif // LP64
10265 #undef BIND
10266 #undef BLOCK_COMMENT
10267 
10268 // Compress char[] array to byte[].
10269 //   ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
10270 //   @HotSpotIntrinsicCandidate
10271 //   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
10272 //     for (int i = 0; i < len; i++) {
10273 //       int c = src[srcOff++];
10274 //       if (c >>> 8 != 0) {
10275 //         return 0;
10276 //       }
10277 //       dst[dstOff++] = (byte)c;
10278 //     }
10279 //     return len;
10280 //   }
10281 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10282   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10283   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10284   Register tmp5, Register result) {
10285   Label copy_chars_loop, return_length, return_zero, done;
10286 
10287   // rsi: src
10288   // rdi: dst
10289   // rdx: len
10290   // rcx: tmp5
10291   // rax: result
10292 
10293   // rsi holds start addr of source char[] to be compressed
10294   // rdi holds start addr of destination byte[]
10295   // rdx holds length
10296 
10297   assert(len != result, "len and result registers must differ");
10298 
10299   // save length for return
10300   push(len);
10301 
10302   if ((UseAVX > 2) && // AVX512
10303     VM_Version::supports_avx512vlbw() &&
10304     VM_Version::supports_bmi2()) {
10305 
10306     Label copy_32_loop, copy_loop_tail, below_threshold;
10307 
10308     // alignment
10309     Label post_alignment;
10310 
10311     // if the length of the string is less than 32, handle it the old-fashioned way
10312     testl(len, -32);
10313     jcc(Assembler::zero, below_threshold);
10314 
10315     // First check whether a character is compressible (<= 0xFF).
10316     // Broadcast 0x00FF into tmp2Reg; an unsigned word compare against it flags any char above 0xFF.
10317     movl(result, 0x00FF);
10318     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
10319 
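          // skip the dst-alignment step below when fewer than 64 chars remain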
10320     testl(len, -64);
10321     jcc(Assembler::zero, post_alignment);
10322 
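          // tmp5 = (-dst) & 31: number of bytes to store before dst reaches a
          // 32-byte boundary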
10323     movl(tmp5, dst);
10324     andl(tmp5, (32 - 1));
10325     negl(tmp5);
10326     andl(tmp5, (32 - 1));
10327 
10328     // bail out when there is nothing to be done
10329     testl(tmp5, 0xFFFFFFFF);
10330     jcc(Assembler::zero, post_alignment);
10331 
10332     // ~(~0 << tmp5), where tmp5 is the # of chars needed to reach dst alignment
10333     movl(result, 0xFFFFFFFF);
10334     shlxl(result, result, tmp5);
10335     notl(result);
10336     kmovdl(k3, result);
10337 
10338     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
10339     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10340     ktestd(k2, k3);
10341     jcc(Assembler::carryClear, return_zero);
10342 
10343     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
10344 
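          // step past the alignment prefix: src advances 2 bytes per char, dst 1 byte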
10345     addptr(src, tmp5);
10346     addptr(src, tmp5);
10347     addptr(dst, tmp5);
10348     subl(len, tmp5);
10349 
10350     bind(post_alignment);
10351     // end of alignment
10352 
10353     movl(tmp5, len);
10354     andl(tmp5, (32 - 1));    // tail count (in chars)
10355     andl(len, ~(32 - 1));    // vector count (in chars)
10356     jcc(Assembler::zero, copy_loop_tail);
10357 
10358     lea(src, Address(src, len, Address::times_2));
10359     lea(dst, Address(dst, len, Address::times_1));
10360     negptr(len);
10361 
10362     bind(copy_32_loop);
10363     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
10364     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10365     kortestdl(k2, k2);
10366     jcc(Assembler::carryClear, return_zero);
10367 
10368     // All elements in the current chunk are valid candidates for
10369     // compression. Write the truncated byte elements to memory.
10370     evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
10371     addptr(len, 32);
10372     jcc(Assembler::notZero, copy_32_loop);
10373 
10374     bind(copy_loop_tail);
10375     // bail out when there is nothing to be done
10376     testl(tmp5, 0xFFFFFFFF);
10377     jcc(Assembler::zero, return_length);
10378 
10379     movl(len, tmp5);
10380 
10381     // ~(~0 << len), where len is the # of remaining elements to process
10382     movl(result, 0xFFFFFFFF);
10383     shlxl(result, result, len);
10384     notl(result);
10385 
10386     kmovdl(k3, result);
10387 
10388     evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
10389     evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
10390     ktestd(k2, k3);
10391     jcc(Assembler::carryClear, return_zero);
10392 
10393     evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
10394     jmp(return_length);
10395 
10396     bind(below_threshold);
10397   }
10398 
10399   if (UseSSE42Intrinsics) {
10400     Label copy_32_loop, copy_16, copy_tail;
10401 
10402     movl(result, len);
10403 
10404     movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
10405 
10406     // vectored compression
10407     andl(len, 0xfffffff0);    // vector count (in chars)
10408     andl(result, 0x0000000f);    // tail count (in chars)
10409     testl(len, len);
10410     jcc(Assembler::zero, copy_16);
10411 
10412     // compress 16 chars per iter
10413     movdl(tmp1Reg, tmp5);
10414     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
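          // tmp4Reg accumulates the OR of every char loaded below, so a single ptest
          // against the 0xff00 mask detects any char with a non-zero high byte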
10415     pxor(tmp4Reg, tmp4Reg);
10416 
10417     lea(src, Address(src, len, Address::times_2));
10418     lea(dst, Address(dst, len, Address::times_1));
10419     negptr(len);
10420 
10421     bind(copy_32_loop);
10422     movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10423     por(tmp4Reg, tmp2Reg);
10424     movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
10425     por(tmp4Reg, tmp3Reg);
10426     ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
10427     jcc(Assembler::notZero, return_zero);
10428     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10429     movdqu(Address(dst, len, Address::times_1), tmp2Reg);
10430     addptr(len, 16);
10431     jcc(Assembler::notZero, copy_32_loop);
10432 
10433     // compress next vector of 8 chars (if any)
10434     bind(copy_16);
10435     movl(len, result);
10436     andl(len, 0xfffffff8);    // vector count (in chars)
10437     andl(result, 0x00000007);    // tail count (in chars)
10438     testl(len, len);
10439     jccb(Assembler::zero, copy_tail);
10440 
10441     movdl(tmp1Reg, tmp5);
10442     pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10443     pxor(tmp3Reg, tmp3Reg);
10444 
10445     movdqu(tmp2Reg, Address(src, 0));
10446     ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
10447     jccb(Assembler::notZero, return_zero);
10448     packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10449     movq(Address(dst, 0), tmp2Reg);
10450     addptr(src, 16);
10451     addptr(dst, 8);
10452 
10453     bind(copy_tail);
10454     movl(len, result);
10455   }
10456   // compress 1 char per iter
10457   testl(len, len);
10458   jccb(Assembler::zero, return_length);
10459   lea(src, Address(src, len, Address::times_2));
10460   lea(dst, Address(dst, len, Address::times_1));
10461   negptr(len);
10462 
10463   bind(copy_chars_loop);
10464   load_unsigned_short(result, Address(src, len, Address::times_2));
10465   testl(result, 0xff00);      // check for a non-LATIN1 char (high byte set)
10466   jccb(Assembler::notZero, return_zero);
10467   movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
10468   increment(len);
10469   jcc(Assembler::notZero, copy_chars_loop);
10470 
10471   // if compression succeeded, return length
10472   bind(return_length);
10473   pop(result);
10474   jmpb(done);
10475 
10476   // if compression failed, return 0
10477   bind(return_zero);
10478   xorl(result, result);
10479   addptr(rsp, wordSize);
10480 
10481   bind(done);
10482 }
10483 
10484 // Inflate byte[] array to char[].
10485 //   ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
10486 //   @HotSpotIntrinsicCandidate
10487 //   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
10488 //     for (int i = 0; i < len; i++) {
10489 //       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
10490 //     }
10491 //   }
10492 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10493   XMMRegister tmp1, Register tmp2) {
10494   Label copy_chars_loop, done, below_threshold;
10495   // rsi: src
10496   // rdi: dst
10497   // rdx: len
10498   // rcx: tmp2
10499 
10500   // rsi holds start addr of source byte[] to be inflated
10501   // rdi holds start addr of destination char[]
10502   // rdx holds length
10503   assert_different_registers(src, dst, len, tmp2);
10504 
10505   if ((UseAVX > 2) && // AVX512
10506     VM_Version::supports_avx512vlbw() &&
10507     VM_Version::supports_bmi2()) {
10508 
10509     Label copy_32_loop, copy_tail;
10510     Register tmp3_aliased = len;
10511 
10512     // if the length of the string is less than 16, handle it the old-fashioned way
10513     testl(len, -16);
10514     jcc(Assembler::zero, below_threshold);
10515 
10516     // Pre-compute the tail and vector counts here so the main loop below needs
10517     // only a single add and conditional branch per iteration
10518     movl(tmp2, len);
10519     andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
10520     andl(len, -32);     // vector count
10521     jccb(Assembler::zero, copy_tail);
10522 
10523     lea(src, Address(src, len, Address::times_1));
10524     lea(dst, Address(dst, len, Address::times_2));
10525     negptr(len);
10526 
10527 
10528     // inflate 32 chars per iter
10529     bind(copy_32_loop);
10530     vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
10531     evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
10532     addptr(len, 32);
10533     jcc(Assembler::notZero, copy_32_loop);
10534 
10535     bind(copy_tail);
10536     // bail out when there is nothing to be done
10537     testl(tmp2, -1); // we don't destroy the contents of tmp2 here
10538     jcc(Assembler::zero, done);
10539 
10540     // ~(~0 << length), where length is the # of remaining elements to process
10541     movl(tmp3_aliased, -1);
10542     shlxl(tmp3_aliased, tmp3_aliased, tmp2);
10543     notl(tmp3_aliased);
10544     kmovdl(k2, tmp3_aliased);
10545     evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
10546     evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);
10547 
10548     jmp(done);
10549   }
10550   if (UseSSE42Intrinsics) {
10551     Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
10552 
10553     movl(tmp2, len);
10554 
10555     if (UseAVX > 1) {
10556       andl(tmp2, (16 - 1));
10557       andl(len, -16);
10558       jccb(Assembler::zero, copy_new_tail);
10559     } else {
10560       andl(tmp2, 0x00000007);   // tail count (in chars)
10561       andl(len, 0xfffffff8);    // vector count (in chars)
10562       jccb(Assembler::zero, copy_tail);
10563     }
10564 
10565     // vectored inflation
10566     lea(src, Address(src, len, Address::times_1));
10567     lea(dst, Address(dst, len, Address::times_2));
10568     negptr(len);
10569 
10570     if (UseAVX > 1) {
10571       bind(copy_16_loop);
10572       vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
10573       vmovdqu(Address(dst, len, Address::times_2), tmp1);
10574       addptr(len, 16);
10575       jcc(Assembler::notZero, copy_16_loop);
10576 
10577       bind(below_threshold);
10578       bind(copy_new_tail);
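            // Both arrival paths are normalized here: via below_threshold (AVX512 case)
            // the remaining char count is still in len, otherwise it is in tmp2; after
            // the copy below it is split into a multiple-of-8 part and a tail of < 8 chars.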
10579       if ((UseAVX > 2) &&
10580         VM_Version::supports_avx512vlbw() &&
10581         VM_Version::supports_bmi2()) {
10582         movl(tmp2, len);
10583       } else {
10584         movl(len, tmp2);
10585       }
10586       andl(tmp2, 0x00000007);
10587       andl(len, 0xFFFFFFF8);
10588       jccb(Assembler::zero, copy_tail);
10589 
10590       pmovzxbw(tmp1, Address(src, 0));
10591       movdqu(Address(dst, 0), tmp1);
10592       addptr(src, 8);
10593       addptr(dst, 2 * 8);
10594 
10595       jmp(copy_tail, true);
10596     }
10597 
10598     // inflate 8 chars per iter
10599     bind(copy_8_loop);
10600     pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
10601     movdqu(Address(dst, len, Address::times_2), tmp1);
10602     addptr(len, 8);
10603     jcc(Assembler::notZero, copy_8_loop);
10604 
10605     bind(copy_tail);
10606     movl(len, tmp2);
10607 
10608     cmpl(len, 4);
10609     jccb(Assembler::less, copy_bytes);
10610 
10611     movdl(tmp1, Address(src, 0));  // load 4 byte chars
10612     pmovzxbw(tmp1, tmp1);
10613     movq(Address(dst, 0), tmp1);
10614     subptr(len, 4);
10615     addptr(src, 4);
10616     addptr(dst, 8);
10617 
10618     bind(copy_bytes);
10619   } else {
10620     bind(below_threshold);
10621   }
10622 
10623   testl(len, len);
10624   jccb(Assembler::zero, done);
10625   lea(src, Address(src, len, Address::times_1));
10626   lea(dst, Address(dst, len, Address::times_2));
10627   negptr(len);
10628 
10629   // inflate 1 char per iter
10630   bind(copy_chars_loop);
10631   load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
10632   movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
10633   increment(len);
10634   jcc(Assembler::notZero, copy_chars_loop);
10635 
10636   bind(done);
10637 }
10638 
10639 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10640   switch (cond) {
10641     // Note some conditions are synonyms for others
10642     case Assembler::zero:         return Assembler::notZero;
10643     case Assembler::notZero:      return Assembler::zero;
10644     case Assembler::less:         return Assembler::greaterEqual;
10645     case Assembler::lessEqual:    return Assembler::greater;
10646     case Assembler::greater:      return Assembler::lessEqual;
10647     case Assembler::greaterEqual: return Assembler::less;
10648     case Assembler::below:        return Assembler::aboveEqual;
10649     case Assembler::belowEqual:   return Assembler::above;
10650     case Assembler::above:        return Assembler::belowEqual;
10651     case Assembler::aboveEqual:   return Assembler::below;
10652     case Assembler::overflow:     return Assembler::noOverflow;
10653     case Assembler::noOverflow:   return Assembler::overflow;
10654     case Assembler::negative:     return Assembler::positive;
10655     case Assembler::positive:     return Assembler::negative;
10656     case Assembler::parity:       return Assembler::noParity;
10657     case Assembler::noParity:     return Assembler::parity;
10658   }
10659   ShouldNotReachHere(); return Assembler::overflow;
10660 }
10661 
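      // RAII helper: the constructor compares the byte flag at flag_addr with value and,
      // if equal, jumps past the code emitted while the SkipIfEqual object is in scope;
      // the destructor binds the skip-target label.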
10662 SkipIfEqual::SkipIfEqual(
10663     MacroAssembler* masm, const bool* flag_addr, bool value) {
10664   _masm = masm;
10665   _masm->cmp8(ExternalAddress((address)flag_addr), value);
10666   _masm->jcc(Assembler::equal, _label);
10667 }
10668 
10669 SkipIfEqual::~SkipIfEqual() {
10670   _masm->bind(_label);
10671 }
10672 
10673 // 32-bit Windows has its own fast-path implementation
10674 // of get_thread
10675 #if !defined(WIN32) || defined(_LP64)
10676 
10677 // This is simply a call to Thread::current()
10678 void MacroAssembler::get_thread(Register thread) {
10679   if (thread != rax) {
10680     push(rax);
10681   }
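        // Save the remaining registers that the C calling convention allows
        // Thread::current() to clobber.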
10682   LP64_ONLY(push(rdi);)
10683   LP64_ONLY(push(rsi);)
10684   push(rdx);
10685   push(rcx);
10686 #ifdef _LP64
10687   push(r8);
10688   push(r9);
10689   push(r10);
10690   push(r11);
10691 #endif
10692 
10693   MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10694 
10695 #ifdef _LP64
10696   pop(r11);
10697   pop(r10);
10698   pop(r9);
10699   pop(r8);
10700 #endif
10701   pop(rcx);
10702   pop(rdx);
10703   LP64_ONLY(pop(rsi);)
10704   LP64_ONLY(pop(rdi);)
10705   if (thread != rax) {
10706     mov(thread, rax);
10707     pop(rax);
10708   }
10709 }
10710 
10711 #endif