/*
 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/flags/flagSetting.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.hpp"
#include "utilities/macros.hpp"
#include "crc32c.h"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif

static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf */
};
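
// Illustrative sketch (editor's addition, not upstream code): reverse[] maps a
// condition code to its logical negation, indexed by the condition's encoding.
// For example, reverse[Assembler::less] is Assembler::greaterEqual, which is
// what is needed when inverting the sense of a branch:
//
//   // branch to 'skip' exactly when condition 'cc' does NOT hold
//   jcc(reverse[cc], skip);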

// Implementation of MacroAssembler

// First, all the versions that differ between 32 and 64 bit,
// unless the difference is trivial (1 line or so).

#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, src1, obj);
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, src1, obj);
}

void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}
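
// Worked example (editor's addition): for lo = -5 (0xFFFFFFFB) the fallback
// path copies lo into hi and arithmetic-shifts by 31, replicating the sign bit
// so hi becomes 0xFFFFFFFF; on P6 with the rdx:rax pair, the single CDQ
// instruction produces the same sign extension of eax into edx.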

void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}

// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}
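
// Worked example (editor's addition): comparing x = 2^32 (x_hi = 1, x_lo = 0)
// with y = 1 (y_hi = 0, y_lo = 1): the signed compare of the high words already
// decides the result, so control transfers to 'high' and x_hi is set to +1
// without inspecting the low words. Only when the high words are equal does the
// unsigned low-word compare run, which matches the semantics of Java's lcmp.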

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}

void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
  // 3rd step
  bind(quick);                                   // note: rbx = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
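
// Worked example (editor's addition): writing x = x_hi*2^32 + x_lo and
// y = y_hi*2^32 + y_lo, the identity used above is
//
//   x*y mod 2^64 = x_lo*y_lo + 2^32*(lo(x_hi*y_lo) + lo(x_lo*y_hi))  (mod 2^64)
//
// e.g. x = 0x1_0000_0002, y = 3 gives x_hi = 1, x_lo = 2, y_hi = 0, y_lo = 3:
// lo = 2*3 = 6 and hi = hi(6) + lo(1*3) + lo(2*0) = 3, i.e. 0x3_0000_0006.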

void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}

void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
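
// Worked example (editor's addition): a logical right shift of hi:lo =
// 0x00000001:0x00000000 (the value 2^32) by s = 33. Since s >= 32 the high
// word is first moved down (lo := hi = 1, hi := 0) and the remaining shift of
// s mod 32 = 1 is performed by shrd/shr, giving hi:lo = 0:0, i.e.
// 2^32 >>> 33 == 0, as required.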

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}

void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}

void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
}
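
// Editor's note (illustrative): the x86-64 ABI requires rsp to be 16-byte
// aligned at the point of a call. testl(rsp, 15) above inspects the low four
// bits of rsp; if they are non-zero (typically rsp is 8 mod 16 because the
// caller's call pushed a return address), the subq(rsp, 8)/addq(rsp, 8) pair
// restores 16-byte alignment around the runtime call.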

void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}

int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivq instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
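
// Editor's note (illustrative): the special case exists because the quotient
// min_long / -1 = 2^63 is not representable in a signed 64-bit register, so
// idivq would raise a divide error (#DE). The JVM spec instead defines
//
//   min_long / -1 == min_long    and    min_long % -1 == 0
//
// which is exactly what rax and rdx already hold when the idivq is skipped.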

void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}

void MacroAssembler::incrementq(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementq(Address(rscratch1, 0));
  }
}

void MacroAssembler::incrementq(Register reg, int value) {
  if (value == min_jint) { addq(reg, value); return; }
  if (value <  0) { decrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(reg) ; return; }
  /* else */      { addq(reg, value)       ; return; }
}

void MacroAssembler::incrementq(Address dst, int value) {
  if (value == min_jint) { addq(dst, value); return; }
  if (value <  0) { decrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(dst) ; return; }
  /* else */      { addq(dst, value)       ; return; }
}
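
// Editor's note (illustrative): the min_jint check in the four helpers above
// guards the 'value < 0' negation: -min_jint does not fit in a 32-bit int
// (there is no +2147483648), so that single value is handled directly with
// addq/subq rather than being forwarded to the opposite-direction helper.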

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}

void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}

void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  // Always clear the pc because it could have been set by make_walkable()
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  vzeroupper();
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  vzeroupper();
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::stop(const char* msg) {
  if (ShowMessageBoxOnError) {
    address rip = pc();
    pusha(); // get regs on stack
    lea(c_rarg1, InternalAddress(rip));
    movq(c_rarg2, rsp); // pass pointer to regs array
  }
  lea(c_rarg0, ExternalAddress((address) msg));
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
  call(rax);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near the top of the stack.
  int64_t* rsp = &regs[16];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::align(int modulus) {
  align(modulus, offset());
}

void MacroAssembler::align(int modulus, int target) {
  if (target % modulus != 0) {
    nop(modulus - (target % modulus));
  }
}
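
// Example (editor's addition): align(16) with offset() == 10 emits
// 16 - (10 % 16) = 6 bytes of nop so the next instruction starts on a 16-byte
// boundary; if the code position is already aligned, nothing is emitted.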

void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::andpd(dst, Address(scratch_reg, 0));
  }
}

void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::andps(dst, Address(scratch_reg, 0));
  }
}

void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}

void MacroAssembler::atomic_incl(Address counter_addr) {
  lock();
  incrementl(counter_addr);
}

void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incl(Address(scr, 0));
  }
}

#ifdef _LP64
void MacroAssembler::atomic_incq(Address counter_addr) {
  lock();
  incrementq(counter_addr);
}

void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incq(Address(scr, 0));
  }
}
#endif

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size);
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size);
  }
}
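
// Worked example (editor's addition): with 4K pages and size = 12K the loop
// above stores at tmp-4K, tmp-8K and tmp-12K, one store per page, leaving tmp
// pointing at the last page banged. The shadow loop then continues downward
// from i = 1 so every page of the shadow zone is touched exactly once.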

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;
  Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
  NOT_LP64(get_thread(rsi);)

  cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
  jcc(Assembler::below, no_reserved_zone_enabling);

  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
  jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         Register tmp_reg2,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  NOT_LP64( Address saved_mark_addr(lock_reg, 0); )

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  andptr(header_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markWord::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markWord::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
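
// Editor's sketch (assumption based on the markWord.hpp layout comments, not
// code from this file): the 64-bit biased mark word manipulated above is laid
// out as
//
//   [ JavaThread* (54) | epoch (2) | unused (1) | age (4) | biased_lock (1) | lock (2) ]
//
// so masking with biased_lock_mask_in_place and comparing against
// biased_lock_pattern (0b101) tests the low three bits, and xoring the mark
// with (prototype header | thread) leaves only the owner/epoch/age differences
// that the code above examines.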
1270 
1271 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1272   assert(UseBiasedLocking, "why call this otherwise?");
1273 
1274   // Check for biased locking unlock case, which is a no-op
1275   // Note: we do not have to check the thread ID for two reasons.
1276   // First, the interpreter checks for IllegalMonitorStateException at
1277   // a higher level. Second, if the bias was revoked while we held the
1278   // lock, the object could not be rebiased toward another thread, so
1279   // the bias bit would be clear.
1280   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1281   andptr(temp_reg, markWord::biased_lock_mask_in_place);
1282   cmpptr(temp_reg, markWord::biased_lock_pattern);
1283   jcc(Assembler::equal, done);
1284 }
1285 
1286 void MacroAssembler::c2bool(Register x) {
1287   // implements x == 0 ? 0 : 1
1288   // note: must only look at least-significant byte of x
1289   //       since C-style booleans are stored in one byte
1290   //       only! (was bug)
1291   andl(x, 0xFF);
1292   setb(Assembler::notZero, x);
1293 }
1294 
1295 // Wouldn't need if AddressLiteral version had new name
1296 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1297   Assembler::call(L, rtype);
1298 }
1299 
1300 void MacroAssembler::call(Register entry) {
1301   Assembler::call(entry);
1302 }
1303 
1304 void MacroAssembler::call(AddressLiteral entry) {
1305   if (reachable(entry)) {
1306     Assembler::call_literal(entry.target(), entry.rspec());
1307   } else {
1308     lea(rscratch1, entry);
1309     Assembler::call(rscratch1);
1310   }
1311 }
1312 
1313 void MacroAssembler::ic_call(address entry, jint method_index) {
1314   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
1315   movptr(rax, (intptr_t)Universe::non_oop_word());
1316   call(AddressLiteral(entry, rh));
1317 }
1318 
1319 // Implementation of call_VM versions
1320 
1321 void MacroAssembler::call_VM(Register oop_result,
1322                              address entry_point,
1323                              bool check_exceptions) {
1324   Label C, E;
1325   call(C, relocInfo::none);
1326   jmp(E);
1327 
1328   bind(C);
1329   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
1330   ret(0);
1331 
1332   bind(E);
1333 }
1334 
1335 void MacroAssembler::call_VM(Register oop_result,
1336                              address entry_point,
1337                              Register arg_1,
1338                              bool check_exceptions) {
1339   Label C, E;
1340   call(C, relocInfo::none);
1341   jmp(E);
1342 
1343   bind(C);
1344   pass_arg1(this, arg_1);
1345   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
1346   ret(0);
1347 
1348   bind(E);
1349 }
1350 
1351 void MacroAssembler::call_VM(Register oop_result,
1352                              address entry_point,
1353                              Register arg_1,
1354                              Register arg_2,
1355                              bool check_exceptions) {
1356   Label C, E;
1357   call(C, relocInfo::none);
1358   jmp(E);
1359 
1360   bind(C);
1361 
1362   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1363 
1364   pass_arg2(this, arg_2);
1365   pass_arg1(this, arg_1);
1366   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
1367   ret(0);
1368 
1369   bind(E);
1370 }
1371 
1372 void MacroAssembler::call_VM(Register oop_result,
1373                              address entry_point,
1374                              Register arg_1,
1375                              Register arg_2,
1376                              Register arg_3,
1377                              bool check_exceptions) {
1378   Label C, E;
1379   call(C, relocInfo::none);
1380   jmp(E);
1381 
1382   bind(C);
1383 
1384   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1385   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1386   pass_arg3(this, arg_3);
1387 
1388   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1389   pass_arg2(this, arg_2);
1390 
1391   pass_arg1(this, arg_1);
1392   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
1393   ret(0);
1394 
1395   bind(E);
1396 }
1397 
1398 void MacroAssembler::call_VM(Register oop_result,
1399                              Register last_java_sp,
1400                              address entry_point,
1401                              int number_of_arguments,
1402                              bool check_exceptions) {
1403   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1404   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1405 }
1406 
1407 void MacroAssembler::call_VM(Register oop_result,
1408                              Register last_java_sp,
1409                              address entry_point,
1410                              Register arg_1,
1411                              bool check_exceptions) {
1412   pass_arg1(this, arg_1);
1413   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1414 }
1415 
1416 void MacroAssembler::call_VM(Register oop_result,
1417                              Register last_java_sp,
1418                              address entry_point,
1419                              Register arg_1,
1420                              Register arg_2,
1421                              bool check_exceptions) {
1422 
1423   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1424   pass_arg2(this, arg_2);
1425   pass_arg1(this, arg_1);
1426   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1427 }
1428 
1429 void MacroAssembler::call_VM(Register oop_result,
1430                              Register last_java_sp,
1431                              address entry_point,
1432                              Register arg_1,
1433                              Register arg_2,
1434                              Register arg_3,
1435                              bool check_exceptions) {
1436   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1437   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1438   pass_arg3(this, arg_3);
1439   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1440   pass_arg2(this, arg_2);
1441   pass_arg1(this, arg_1);
1442   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1443 }
1444 
1445 void MacroAssembler::super_call_VM(Register oop_result,
1446                                    Register last_java_sp,
1447                                    address entry_point,
1448                                    int number_of_arguments,
1449                                    bool check_exceptions) {
1450   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
1451   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
1452 }
1453 
1454 void MacroAssembler::super_call_VM(Register oop_result,
1455                                    Register last_java_sp,
1456                                    address entry_point,
1457                                    Register arg_1,
1458                                    bool check_exceptions) {
1459   pass_arg1(this, arg_1);
1460   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
1461 }
1462 
1463 void MacroAssembler::super_call_VM(Register oop_result,
1464                                    Register last_java_sp,
1465                                    address entry_point,
1466                                    Register arg_1,
1467                                    Register arg_2,
1468                                    bool check_exceptions) {
1469 
1470   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1471   pass_arg2(this, arg_2);
1472   pass_arg1(this, arg_1);
1473   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
1474 }
1475 
1476 void MacroAssembler::super_call_VM(Register oop_result,
1477                                    Register last_java_sp,
1478                                    address entry_point,
1479                                    Register arg_1,
1480                                    Register arg_2,
1481                                    Register arg_3,
1482                                    bool check_exceptions) {
1483   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1484   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1485   pass_arg3(this, arg_3);
1486   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1487   pass_arg2(this, arg_2);
1488   pass_arg1(this, arg_1);
1489   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
1490 }
1491 
1492 void MacroAssembler::call_VM_base(Register oop_result,
1493                                   Register java_thread,
1494                                   Register last_java_sp,
1495                                   address  entry_point,
1496                                   int      number_of_arguments,
1497                                   bool     check_exceptions) {
1498   // determine java_thread register
1499   if (!java_thread->is_valid()) {
1500 #ifdef _LP64
1501     java_thread = r15_thread;
1502 #else
1503     java_thread = rdi;
1504     get_thread(java_thread);
1505 #endif // LP64
1506   }
1507   // determine last_java_sp register
1508   if (!last_java_sp->is_valid()) {
1509     last_java_sp = rsp;
1510   }
1511   // debugging support
1512   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
1513   LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
1514 #ifdef ASSERT
1515   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
1516   // r12 is the heapbase.
1517   LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
1518 #endif // ASSERT
1519 
1520   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
1521   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
1522 
1523   // push java thread (becomes first argument of C function)
1524 
1525   NOT_LP64(push(java_thread); number_of_arguments++);
1526   LP64_ONLY(mov(c_rarg0, r15_thread));
1527 
1528   // set last Java frame before call
1529   assert(last_java_sp != rbp, "can't use ebp/rbp");
1530 
1531   // Only interpreter should have to set fp
1532   set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
1533 
1534   // do the call, remove parameters
1535   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
1536 
1537   // restore the thread (cannot use the pushed argument since arguments
1538   // may be overwritten by C code generated by an optimizing compiler);
1539   // however, we can use the register value directly if it is callee saved.
1540   if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
1541     // rdi & rsi (also r15) are callee saved -> nothing to do
1542 #ifdef ASSERT
1543     guarantee(java_thread != rax, "change this code");
1544     push(rax);
1545     { Label L;
1546       get_thread(rax);
1547       cmpptr(java_thread, rax);
1548       jcc(Assembler::equal, L);
1549       STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
1550       bind(L);
1551     }
1552     pop(rax);
1553 #endif
1554   } else {
1555     get_thread(java_thread);
1556   }
1557   // reset last Java frame
1558   // Only interpreter should have to clear fp
1559   reset_last_Java_frame(java_thread, true);
1560 
1561   // C++ interp handles this in the interpreter
1562   check_and_handle_popframe(java_thread);
1563   check_and_handle_earlyret(java_thread);
1564 
1565   if (check_exceptions) {
1566     // check for pending exceptions (java_thread is set upon return)
1567     cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
1568 #ifndef _LP64
1569     jump_cc(Assembler::notEqual,
1570             RuntimeAddress(StubRoutines::forward_exception_entry()));
1571 #else
1572     // This used to be a conditional jump to forward_exception, but if the
1573     // code is relocated the conditional branch might not reach its target.
1574     // So we branch around an unconditional jump that can always reach it.
1575 
1576     Label ok;
1577     jcc(Assembler::equal, ok);
1578     jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
1579     bind(ok);
1580 #endif // LP64
1581   }
1582 
1583   // get oop result if there is one and reset the value in the thread
1584   if (oop_result->is_valid()) {
1585     get_vm_result(oop_result, java_thread);
1586   }
1587 }
1588 
1589 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
1590 
1591   // Calculating the value for last_Java_sp is somewhat subtle.
1592   // call_VM does an intermediate call which places a return address on
1593   // the stack just below the stack pointer as the caller left it.
1594   // This allows us to retrieve last_Java_pc from last_Java_sp[-1].
1595   // On 32bit we then have to push additional args on the stack to
1596   // accomplish the actual requested call. On 64bit call_VM can only use
1597   // register args, so the only extra space is the return address that
1598   // call_VM created.
1599   // This hopefully explains the calculations here.
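  //
  // A concrete sketch of the 64-bit case: immediately after the intermediate
  // call,
  //
  //   rsp[0]           = return address pushed by that call
  //   last_Java_sp     = rsp + wordSize        (sp before the call)
  //   last_Java_sp[-1] = rsp[0]                (the last_Java_pc)
  //
  // which is exactly what the lea below computes into rax.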
1600 
1601 #ifdef _LP64
1602   // We've pushed one address, correct last_Java_sp
1603   lea(rax, Address(rsp, wordSize));
1604 #else
1605   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
1606 #endif // LP64
1607 
1608   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
1609 
1610 }
1611 
1612 // Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
1613 void MacroAssembler::call_VM_leaf0(address entry_point) {
1614   MacroAssembler::call_VM_leaf_base(entry_point, 0);
1615 }
1616 
1617 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1618   call_VM_leaf_base(entry_point, number_of_arguments);
1619 }
1620 
1621 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1622   pass_arg0(this, arg_0);
1623   call_VM_leaf(entry_point, 1);
1624 }
1625 
1626 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1627 
1628   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1629   pass_arg1(this, arg_1);
1630   pass_arg0(this, arg_0);
1631   call_VM_leaf(entry_point, 2);
1632 }
1633 
1634 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1635   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1636   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1637   pass_arg2(this, arg_2);
1638   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1639   pass_arg1(this, arg_1);
1640   pass_arg0(this, arg_0);
1641   call_VM_leaf(entry_point, 3);
1642 }
1643 
1644 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1645   pass_arg0(this, arg_0);
1646   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1647 }
1648 
1649 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1650 
1651   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1652   pass_arg1(this, arg_1);
1653   pass_arg0(this, arg_0);
1654   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1655 }
1656 
1657 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1658   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1659   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1660   pass_arg2(this, arg_2);
1661   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1662   pass_arg1(this, arg_1);
1663   pass_arg0(this, arg_0);
1664   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1665 }
1666 
1667 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1668   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
1669   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
1670   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
1671   pass_arg3(this, arg_3);
1672   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
1673   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
1674   pass_arg2(this, arg_2);
1675   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
1676   pass_arg1(this, arg_1);
1677   pass_arg0(this, arg_0);
1678   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1679 }
1680 
1681 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
1682   movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
1683   movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
1684   verify_oop_msg(oop_result, "broken oop in call_VM_base");
1685 }
1686 
1687 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
1688   movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
1689   movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
1690 }
1691 
1692 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
1693 }
1694 
1695 void MacroAssembler::check_and_handle_popframe(Register java_thread) {
1696 }
1697 
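// Note on the AddressLiteral overloads below: on 64-bit a literal address is
// only directly encodable if it fits in a 32-bit (RIP-relative) displacement.
// When it does not, the address is first materialized into rscratch1 and the
// operation goes through the scratch register. A sketch of the recurring
// idiom:
//
//   if (reachable(src)) {
//     op(dst, as_Address(src));        // disp32-reachable: encode directly
//   } else {
//     lea(rscratch1, src);             // load the full 64-bit address
//     op(dst, Address(rscratch1, 0));  // then access it indirectly
//   }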
1698 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
1699   if (reachable(src1)) {
1700     cmpl(as_Address(src1), imm);
1701   } else {
1702     lea(rscratch1, src1);
1703     cmpl(Address(rscratch1, 0), imm);
1704   }
1705 }
1706 
1707 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
1708   assert(!src2.is_lval(), "use cmpptr");
1709   if (reachable(src2)) {
1710     cmpl(src1, as_Address(src2));
1711   } else {
1712     lea(rscratch1, src2);
1713     cmpl(src1, Address(rscratch1, 0));
1714   }
1715 }
1716 
1717 void MacroAssembler::cmp32(Register src1, int32_t imm) {
1718   Assembler::cmpl(src1, imm);
1719 }
1720 
1721 void MacroAssembler::cmp32(Register src1, Address src2) {
1722   Assembler::cmpl(src1, src2);
1723 }
1724 
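// cmpsd2int/cmpss2int implement the Java dcmp<op>/fcmp<op> semantics: the
// result is -1, 0 or 1 for less-than, equal and greater-than, and
// unordered_is_less selects the NaN behavior (fcmpl/dcmpl produce -1 on an
// unordered compare, fcmpg/dcmpg produce 1). ucomisd/ucomiss set PF when a
// comparison is unordered, which is why parity is tested first below.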
1725 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1726   ucomisd(opr1, opr2);
1727 
1728   Label L;
1729   if (unordered_is_less) {
1730     movl(dst, -1);
1731     jcc(Assembler::parity, L);
1732     jcc(Assembler::below , L);
1733     movl(dst, 0);
1734     jcc(Assembler::equal , L);
1735     increment(dst);
1736   } else { // unordered is greater
1737     movl(dst, 1);
1738     jcc(Assembler::parity, L);
1739     jcc(Assembler::above , L);
1740     movl(dst, 0);
1741     jcc(Assembler::equal , L);
1742     decrementl(dst);
1743   }
1744   bind(L);
1745 }
1746 
1747 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
1748   ucomiss(opr1, opr2);
1749 
1750   Label L;
1751   if (unordered_is_less) {
1752     movl(dst, -1);
1753     jcc(Assembler::parity, L);
1754     jcc(Assembler::below , L);
1755     movl(dst, 0);
1756     jcc(Assembler::equal , L);
1757     increment(dst);
1758   } else { // unordered is greater
1759     movl(dst, 1);
1760     jcc(Assembler::parity, L);
1761     jcc(Assembler::above , L);
1762     movl(dst, 0);
1763     jcc(Assembler::equal , L);
1764     decrementl(dst);
1765   }
1766   bind(L);
1767 }
1768 
1769 
1770 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
1771   if (reachable(src1)) {
1772     cmpb(as_Address(src1), imm);
1773   } else {
1774     lea(rscratch1, src1);
1775     cmpb(Address(rscratch1, 0), imm);
1776   }
1777 }
1778 
1779 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
1780 #ifdef _LP64
1781   if (src2.is_lval()) {
1782     movptr(rscratch1, src2);
1783     Assembler::cmpq(src1, rscratch1);
1784   } else if (reachable(src2)) {
1785     cmpq(src1, as_Address(src2));
1786   } else {
1787     lea(rscratch1, src2);
1788     Assembler::cmpq(src1, Address(rscratch1, 0));
1789   }
1790 #else
1791   if (src2.is_lval()) {
1792     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1793   } else {
1794     cmpl(src1, as_Address(src2));
1795   }
1796 #endif // _LP64
1797 }
1798 
1799 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
1800   assert(src2.is_lval(), "not a mem-mem compare");
1801 #ifdef _LP64
1802   // moves src2's literal address
1803   movptr(rscratch1, src2);
1804   Assembler::cmpq(src1, rscratch1);
1805 #else
1806   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
1807 #endif // _LP64
1808 }
1809 
1810 void MacroAssembler::cmpoop(Register src1, Register src2) {
1811   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1812   bs->obj_equals(this, src1, src2);
1813 }
1814 
1815 void MacroAssembler::cmpoop(Register src1, Address src2) {
1816   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1817   bs->obj_equals(this, src1, src2);
1818 }
1819 
1820 #ifdef _LP64
1821 void MacroAssembler::cmpoop(Register src1, jobject src2) {
1822   movoop(rscratch1, src2);
1823   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1824   bs->obj_equals(this, src1, rscratch1);
1825 }
1826 #endif
1827 
1828 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
1829   if (reachable(adr)) {
1830     lock();
1831     cmpxchgptr(reg, as_Address(adr));
1832   } else {
1833     lea(rscratch1, adr);
1834     lock();
1835     cmpxchgptr(reg, Address(rscratch1, 0));
1836   }
1837 }
1838 
1839 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
1840   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
1841 }
1842 
1843 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
1844   if (reachable(src)) {
1845     Assembler::comisd(dst, as_Address(src));
1846   } else {
1847     lea(rscratch1, src);
1848     Assembler::comisd(dst, Address(rscratch1, 0));
1849   }
1850 }
1851 
1852 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
1853   if (reachable(src)) {
1854     Assembler::comiss(dst, as_Address(src));
1855   } else {
1856     lea(rscratch1, src);
1857     Assembler::comiss(dst, Address(rscratch1, 0));
1858   }
1859 }
1860 
1861 
1862 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
1863   Condition negated_cond = negate_condition(cond);
1864   Label L;
1865   jcc(negated_cond, L);
1866   pushf(); // Preserve flags
1867   atomic_incl(counter_addr);
1868   popf();
1869   bind(L);
1870 }
1871 
1872 int MacroAssembler::corrected_idivl(Register reg) {
1873   // Full implementation of Java idiv and irem; checks for
1874   // special case as described in JVM spec., p.243 & p.271.
1875   // The function returns the (pc) offset of the idivl
1876   // instruction - may be needed for implicit exceptions.
1877   //
1878   //         normal case                           special case
1879   //
1880   // input : rax: dividend                        min_int
1881   //         reg: divisor   (may not be rax/rdx)  -1
1882   //
1883   // output: rax: quotient  (= rax idiv reg)      min_int
1884   //         rdx: remainder (= rax irem reg)      0
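  //
  // The special case exists because the quotient min_int / -1 overflows:
  // Java requires Integer.MIN_VALUE / -1 == Integer.MIN_VALUE and
  // Integer.MIN_VALUE % -1 == 0, while the hardware idivl would raise #DE.
  // For example:
  //
  //   0x80000000 / -1  ->  0x80000000   (quotient, in rax)
  //   0x80000000 % -1  ->  0            (remainder, in rdx)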
1885   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
1886   const int min_int = 0x80000000;
1887   Label normal_case, special_case;
1888 
1889   // check for special case
1890   cmpl(rax, min_int);
1891   jcc(Assembler::notEqual, normal_case);
1892   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
1893   cmpl(reg, -1);
1894   jcc(Assembler::equal, special_case);
1895 
1896   // handle normal case
1897   bind(normal_case);
1898   cdql();
1899   int idivl_offset = offset();
1900   idivl(reg);
1901 
1902   // normal and special case exit
1903   bind(special_case);
1904 
1905   return idivl_offset;
1906 }
1907 
1908 
1909 
1910 void MacroAssembler::decrementl(Register reg, int value) {
1911   if (value == min_jint) {subl(reg, value) ; return; }
1912   if (value <  0) { incrementl(reg, -value); return; }
1913   if (value == 0) {                        ; return; }
1914   if (value == 1 && UseIncDec) { decl(reg) ; return; }
1915   /* else */      { subl(reg, value)       ; return; }
1916 }
1917 
1918 void MacroAssembler::decrementl(Address dst, int value) {
1919   if (value == min_jint) {subl(dst, value) ; return; }
1920   if (value <  0) { incrementl(dst, -value); return; }
1921   if (value == 0) {                        ; return; }
1922   if (value == 1 && UseIncDec) { decl(dst) ; return; }
1923   /* else */      { subl(dst, value)       ; return; }
1924 }
1925 
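// division_with_shift implements signed division by 2^shift_value with
// truncation toward zero (Java semantics). A plain arithmetic shift rounds
// toward negative infinity, so a bias of (2^shift_value - 1) is added to
// negative dividends first. A worked example with shift_value = 2:
//
//   -7 >> 2       == -2   (floor division; wrong for Java)
//   (-7 + 3) >> 2 == -1   (biased; truncates toward zero, correct)
//    7 >> 2       ==  1   (positive values need no bias)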
1926 void MacroAssembler::division_with_shift(Register reg, int shift_value) {
1927   assert(shift_value > 0, "illegal shift value");
1928   Label _is_positive;
1929   testl(reg, reg);
1930   jcc(Assembler::positive, _is_positive);
1931   int offset = (1 << shift_value) - 1;
1932 
1933   if (offset == 1) {
1934     incrementl(reg);
1935   } else {
1936     addl(reg, offset);
1937   }
1938 
1939   bind(_is_positive);
1940   sarl(reg, shift_value);
1941 }
1942 
1943 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
1944   if (reachable(src)) {
1945     Assembler::divsd(dst, as_Address(src));
1946   } else {
1947     lea(rscratch1, src);
1948     Assembler::divsd(dst, Address(rscratch1, 0));
1949   }
1950 }
1951 
1952 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
1953   if (reachable(src)) {
1954     Assembler::divss(dst, as_Address(src));
1955   } else {
1956     lea(rscratch1, src);
1957     Assembler::divss(dst, Address(rscratch1, 0));
1958   }
1959 }
1960 
1961 void MacroAssembler::enter() {
1962   push(rbp);
1963   mov(rbp, rsp);
1964 }
1965 
1966 // A 5 byte nop that is safe for patching (see patch_verified_entry)
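// The four segment-override prefixes below combine with the final 0x90 into
// a single 5-byte instruction, so the sequence can later be overwritten
// atomically with a 5-byte jump without a thread ever observing a torn
// instruction.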
1967 void MacroAssembler::fat_nop() {
1968   if (UseAddressNop) {
1969     addr_nop_5();
1970   } else {
1971     emit_int8(0x26); // es:
1972     emit_int8(0x2e); // cs:
1973     emit_int8(0x64); // fs:
1974     emit_int8(0x65); // gs:
1975     emit_int8((unsigned char)0x90);
1976   }
1977 }
1978 
1979 #ifndef _LP64
1980 void MacroAssembler::fcmp(Register tmp) {
1981   fcmp(tmp, 1, true, true);
1982 }
1983 
1984 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
1985   assert(!pop_right || pop_left, "usage error");
1986   if (VM_Version::supports_cmov()) {
1987     assert(tmp == noreg, "unneeded temp");
1988     if (pop_left) {
1989       fucomip(index);
1990     } else {
1991       fucomi(index);
1992     }
1993     if (pop_right) {
1994       fpop();
1995     }
1996   } else {
1997     assert(tmp != noreg, "need temp");
1998     if (pop_left) {
1999       if (pop_right) {
2000         fcompp();
2001       } else {
2002         fcomp(index);
2003       }
2004     } else {
2005       fcom(index);
2006     }
2007     // convert FPU condition into eflags condition via rax
2008     save_rax(tmp);
2009     fwait(); fnstsw_ax();
2010     sahf();
2011     restore_rax(tmp);
2012   }
2013   // condition codes set as follows:
2014   //
2015   // CF (corresponds to C0) if x < y
2016   // PF (corresponds to C2) if unordered
2017   // ZF (corresponds to C3) if x = y
2018 }
2019 
2020 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
2021   fcmp2int(dst, unordered_is_less, 1, true, true);
2022 }
2023 
2024 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
2025   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
2026   Label L;
2027   if (unordered_is_less) {
2028     movl(dst, -1);
2029     jcc(Assembler::parity, L);
2030     jcc(Assembler::below , L);
2031     movl(dst, 0);
2032     jcc(Assembler::equal , L);
2033     increment(dst);
2034   } else { // unordered is greater
2035     movl(dst, 1);
2036     jcc(Assembler::parity, L);
2037     jcc(Assembler::above , L);
2038     movl(dst, 0);
2039     jcc(Assembler::equal , L);
2040     decrementl(dst);
2041   }
2042   bind(L);
2043 }
2044 
2045 void MacroAssembler::fld_d(AddressLiteral src) {
2046   fld_d(as_Address(src));
2047 }
2048 
2049 void MacroAssembler::fld_s(AddressLiteral src) {
2050   fld_s(as_Address(src));
2051 }
2052 
2053 void MacroAssembler::fld_x(AddressLiteral src) {
2054   Assembler::fld_x(as_Address(src));
2055 }
2056 
2057 void MacroAssembler::fldcw(AddressLiteral src) {
2058   Assembler::fldcw(as_Address(src));
2059 }
2060 
2061 void MacroAssembler::fpop() {
2062   ffree();
2063   fincstp();
2064 }
2065 
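// fremr computes a floating-point remainder via fprem (truncating,
// fmod-style semantics, matching Java's % on float/double). fprem may only
// perform a partial reduction per iteration and reports completion in the
// C2 status flag; fnstsw_ax/sahf map C2 onto PF, so the loop repeats while
// parity is still set.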
2066 void MacroAssembler::fremr(Register tmp) {
2067   save_rax(tmp);
2068   { Label L;
2069     bind(L);
2070     fprem();
2071     fwait(); fnstsw_ax();
2072     sahf();
2073     jcc(Assembler::parity, L);
2074   }
2075   restore_rax(tmp);
2076   // Result is in ST0.
2077   // Note: fxch & fpop to get rid of ST1
2078   // (otherwise FPU stack could overflow eventually)
2079   fxch(1);
2080   fpop();
2081 }
2082 
2083 void MacroAssembler::empty_FPU_stack() {
2084   if (VM_Version::supports_mmx()) {
2085     emms();
2086   } else {
2087     for (int i = 8; i-- > 0; ) ffree(i);
2088   }
2089 }
2090 #endif // !LP64
2091 
2092 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
2093   if (reachable(src)) {
2094     Assembler::mulpd(dst, as_Address(src));
2095   } else {
2096     lea(rscratch1, src);
2097     Assembler::mulpd(dst, Address(rscratch1, 0));
2098   }
2099 }
2100 
2101 void MacroAssembler::load_float(Address src) {
2102 #ifdef _LP64
2103   movflt(xmm0, src);
2104 #else
2105   if (UseSSE >= 1) {
2106     movflt(xmm0, src);
2107   } else {
2108     fld_s(src);
2109   }
2110 #endif // LP64
2111 }
2112 
2113 void MacroAssembler::store_float(Address dst) {
2114 #ifdef _LP64
2115   movflt(dst, xmm0);
2116 #else
2117   if (UseSSE >= 1) {
2118     movflt(dst, xmm0);
2119   } else {
2120     fstp_s(dst);
2121   }
2122 #endif // LP64
2123 }
2124 
2125 void MacroAssembler::load_double(Address src) {
2126 #ifdef _LP64
2127   movdbl(xmm0, src);
2128 #else
2129   if (UseSSE >= 2) {
2130     movdbl(xmm0, src);
2131   } else {
2132     fld_d(src);
2133   }
2134 #endif // LP64
2135 }
2136 
2137 void MacroAssembler::store_double(Address dst) {
2138 #ifdef _LP64
2139   movdbl(dst, xmm0);
2140 #else
2141   if (UseSSE >= 2) {
2142     movdbl(dst, xmm0);
2143   } else {
2144     fstp_d(dst);
2145   }
2146 #endif // LP64
2147 }
2148 
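// Note on the FMA mnemonics used below: in vfmadd231xx the digits name the
// operand roles, dst = src2 * src3 + dst. The addend is also the
// destination, which is why c must receive the raw instruction's result and
// is copied to dst afterwards when they differ.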
2149 // dst = c = a * b + c
2150 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2151   Assembler::vfmadd231sd(c, a, b);
2152   if (dst != c) {
2153     movdbl(dst, c);
2154   }
2155 }
2156 
2157 // dst = c = a * b + c
2158 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
2159   Assembler::vfmadd231ss(c, a, b);
2160   if (dst != c) {
2161     movflt(dst, c);
2162   }
2163 }
2164 
2165 // dst = c = a * b + c
2166 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2167   Assembler::vfmadd231pd(c, a, b, vector_len);
2168   if (dst != c) {
2169     vmovdqu(dst, c);
2170   }
2171 }
2172 
2173 // dst = c = a * b + c
2174 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
2175   Assembler::vfmadd231ps(c, a, b, vector_len);
2176   if (dst != c) {
2177     vmovdqu(dst, c);
2178   }
2179 }
2180 
2181 // dst = c = a * b + c
2182 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2183   Assembler::vfmadd231pd(c, a, b, vector_len);
2184   if (dst != c) {
2185     vmovdqu(dst, c);
2186   }
2187 }
2188 
2189 // dst = c = a * b + c
2190 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
2191   Assembler::vfmadd231ps(c, a, b, vector_len);
2192   if (dst != c) {
2193     vmovdqu(dst, c);
2194   }
2195 }
2196 
2197 void MacroAssembler::incrementl(AddressLiteral dst) {
2198   if (reachable(dst)) {
2199     incrementl(as_Address(dst));
2200   } else {
2201     lea(rscratch1, dst);
2202     incrementl(Address(rscratch1, 0));
2203   }
2204 }
2205 
2206 void MacroAssembler::incrementl(ArrayAddress dst) {
2207   incrementl(as_Address(dst));
2208 }
2209 
2210 void MacroAssembler::incrementl(Register reg, int value) {
2211   if (value == min_jint) {addl(reg, value) ; return; }
2212   if (value <  0) { decrementl(reg, -value); return; }
2213   if (value == 0) {                        ; return; }
2214   if (value == 1 && UseIncDec) { incl(reg) ; return; }
2215   /* else */      { addl(reg, value)       ; return; }
2216 }
2217 
2218 void MacroAssembler::incrementl(Address dst, int value) {
2219   if (value == min_jint) {addl(dst, value) ; return; }
2220   if (value <  0) { decrementl(dst, -value); return; }
2221   if (value == 0) {                        ; return; }
2222   if (value == 1 && UseIncDec) { incl(dst) ; return; }
2223   /* else */      { addl(dst, value)       ; return; }
2224 }
2225 
2226 void MacroAssembler::jump(AddressLiteral dst) {
2227   if (reachable(dst)) {
2228     jmp_literal(dst.target(), dst.rspec());
2229   } else {
2230     lea(rscratch1, dst);
2231     jmp(rscratch1);
2232   }
2233 }
2234 
2235 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
2236   if (reachable(dst)) {
2237     InstructionMark im(this);
2238     relocate(dst.reloc());
2239     const int short_size = 2;
2240     const int long_size = 6;
2241     int offs = (intptr_t)dst.target() - ((intptr_t)pc());
2242     if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
2243       // 0111 tttn #8-bit disp
2244       emit_int8(0x70 | cc);
2245       emit_int8((offs - short_size) & 0xFF);
2246     } else {
2247       // 0000 1111 1000 tttn #32-bit disp
2248       emit_int8(0x0F);
2249       emit_int8((unsigned char)(0x80 | cc));
2250       emit_int32(offs - long_size);
2251     }
2252   } else {
2253 #ifdef ASSERT
2254     warning("reversing conditional branch");
2255 #endif /* ASSERT */
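    // The target may be out of branch range once relocated: branch on the
    // negated condition around an indirect jump through rscratch1, which can
    // reach any address.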
2256     Label skip;
2257     jccb(reverse[cc], skip);
2258     lea(rscratch1, dst);
2259     Assembler::jmp(rscratch1);
2260     bind(skip);
2261   }
2262 }
2263 
2264 void MacroAssembler::ldmxcsr(AddressLiteral src) {
2265   if (reachable(src)) {
2266     Assembler::ldmxcsr(as_Address(src));
2267   } else {
2268     lea(rscratch1, src);
2269     Assembler::ldmxcsr(Address(rscratch1, 0));
2270   }
2271 }
2272 
2273 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2274   int off;
2275   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2276     off = offset();
2277     movsbl(dst, src); // movsxb
2278   } else {
2279     off = load_unsigned_byte(dst, src);
2280     shll(dst, 24);
2281     sarl(dst, 24);
2282   }
2283   return off;
2284 }
2285 
2286 // Note: load_signed_short used to be called load_signed_word.
2287 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
2288 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
2289 // The term "word" in HotSpot means a 32- or 64-bit machine word.
2290 int MacroAssembler::load_signed_short(Register dst, Address src) {
2291   int off;
2292   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2293     // This is dubious: a signed 16 => 64 bit sign extension seems safe
2294     // here, but this is what 64bit has always done, which implies that
2295     // callers only rely on the low 32 bits.
2296     off = offset();
2297     movswl(dst, src); // movsxw
2298   } else {
2299     off = load_unsigned_short(dst, src);
2300     shll(dst, 16);
2301     sarl(dst, 16);
2302   }
2303   return off;
2304 }
2305 
2306 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2307   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
2308   // and "3.9 Partial Register Penalties", p. 22.
2309   int off;
2310   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
2311     off = offset();
2312     movzbl(dst, src); // movzxb
2313   } else {
2314     xorl(dst, dst);
2315     off = offset();
2316     movb(dst, src);
2317   }
2318   return off;
2319 }
2320 
2321 // Note: load_unsigned_short used to be called load_unsigned_word.
2322 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2323   // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
2324   // and "3.9 Partial Register Penalties", p. 22.
2325   int off;
2326   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
2327     off = offset();
2328     movzwl(dst, src); // movzxw
2329   } else {
2330     xorl(dst, dst);
2331     off = offset();
2332     movw(dst, src);
2333   }
2334   return off;
2335 }
2336 
2337 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
2338   switch (size_in_bytes) {
2339 #ifndef _LP64
2340   case  8:
2341     assert(dst2 != noreg, "second dest register required");
2342     movl(dst,  src);
2343     movl(dst2, src.plus_disp(BytesPerInt));
2344     break;
2345 #else
2346   case  8:  movq(dst, src); break;
2347 #endif
2348   case  4:  movl(dst, src); break;
2349   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2350   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2351   default:  ShouldNotReachHere();
2352   }
2353 }
2354 
2355 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2356   switch (size_in_bytes) {
2357 #ifndef _LP64
2358   case  8:
2359     assert(src2 != noreg, "second source register required");
2360     movl(dst,                        src);
2361     movl(dst.plus_disp(BytesPerInt), src2);
2362     break;
2363 #else
2364   case  8:  movq(dst, src); break;
2365 #endif
2366   case  4:  movl(dst, src); break;
2367   case  2:  movw(dst, src); break;
2368   case  1:  movb(dst, src); break;
2369   default:  ShouldNotReachHere();
2370   }
2371 }
2372 
2373 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
2374   if (reachable(dst)) {
2375     movl(as_Address(dst), src);
2376   } else {
2377     lea(rscratch1, dst);
2378     movl(Address(rscratch1, 0), src);
2379   }
2380 }
2381 
2382 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
2383   if (reachable(src)) {
2384     movl(dst, as_Address(src));
2385   } else {
2386     lea(rscratch1, src);
2387     movl(dst, Address(rscratch1, 0));
2388   }
2389 }
2390 
2391 // C++ bool manipulation
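//
// sizeof(bool) is implementation-defined in C++, so these helpers dispatch
// on it; in practice it is 1 on the toolchains HotSpot supports, but the
// other branches keep the code portable.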
2392 
2393 void MacroAssembler::movbool(Register dst, Address src) {
2394   if(sizeof(bool) == 1)
2395     movb(dst, src);
2396   else if(sizeof(bool) == 2)
2397     movw(dst, src);
2398   else if(sizeof(bool) == 4)
2399     movl(dst, src);
2400   else
2401     // unsupported
2402     ShouldNotReachHere();
2403 }
2404 
2405 void MacroAssembler::movbool(Address dst, bool boolconst) {
2406   if(sizeof(bool) == 1)
2407     movb(dst, (int) boolconst);
2408   else if(sizeof(bool) == 2)
2409     movw(dst, (int) boolconst);
2410   else if(sizeof(bool) == 4)
2411     movl(dst, (int) boolconst);
2412   else
2413     // unsupported
2414     ShouldNotReachHere();
2415 }
2416 
2417 void MacroAssembler::movbool(Address dst, Register src) {
2418   if(sizeof(bool) == 1)
2419     movb(dst, src);
2420   else if(sizeof(bool) == 2)
2421     movw(dst, src);
2422   else if(sizeof(bool) == 4)
2423     movl(dst, src);
2424   else
2425     // unsupported
2426     ShouldNotReachHere();
2427 }
2428 
2429 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
2430   movb(as_Address(dst), src);
2431 }
2432 
2433 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
2434   if (reachable(src)) {
2435     movdl(dst, as_Address(src));
2436   } else {
2437     lea(rscratch1, src);
2438     movdl(dst, Address(rscratch1, 0));
2439   }
2440 }
2441 
2442 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
2443   if (reachable(src)) {
2444     movq(dst, as_Address(src));
2445   } else {
2446     lea(rscratch1, src);
2447     movq(dst, Address(rscratch1, 0));
2448   }
2449 }
2450 
2451 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
2452   if (reachable(src)) {
2453     if (UseXmmLoadAndClearUpper) {
2454       movsd (dst, as_Address(src));
2455     } else {
2456       movlpd(dst, as_Address(src));
2457     }
2458   } else {
2459     lea(rscratch1, src);
2460     if (UseXmmLoadAndClearUpper) {
2461       movsd (dst, Address(rscratch1, 0));
2462     } else {
2463       movlpd(dst, Address(rscratch1, 0));
2464     }
2465   }
2466 }
2467 
2468 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
2469   if (reachable(src)) {
2470     movss(dst, as_Address(src));
2471   } else {
2472     lea(rscratch1, src);
2473     movss(dst, Address(rscratch1, 0));
2474   }
2475 }
2476 
2477 void MacroAssembler::movptr(Register dst, Register src) {
2478   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2479 }
2480 
2481 void MacroAssembler::movptr(Register dst, Address src) {
2482   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2483 }
2484 
2485 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
2486 void MacroAssembler::movptr(Register dst, intptr_t src) {
2487   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
2488 }
2489 
2490 void MacroAssembler::movptr(Address dst, Register src) {
2491   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
2492 }
2493 
2494 void MacroAssembler::movdqu(Address dst, XMMRegister src) {
2495   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2496   Assembler::movdqu(dst, src);
2497 }
2498 
2499 void MacroAssembler::movdqu(XMMRegister dst, Address src) {
2500   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2501   Assembler::movdqu(dst, src);
2502 }
2503 
2504 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
2505   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2506   if (dst->encoding() == src->encoding()) return;
2507   Assembler::movdqu(dst, src);
2508 }
2509 
2510 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) {
2511   if (reachable(src)) {
2512     movdqu(dst, as_Address(src));
2513   } else {
2514     lea(scratchReg, src);
2515     movdqu(dst, Address(scratchReg, 0));
2516   }
2517 }
2518 
2519 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
2520   assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2521   Assembler::vmovdqu(dst, src);
2522 }
2523 
2524 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
2525   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2526   Assembler::vmovdqu(dst, src);
2527 }
2528 
2529 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
2530   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
2531   if (dst->encoding() == src->encoding()) return;
2532   Assembler::vmovdqu(dst, src);
2533 }
2534 
2535 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2536   if (reachable(src)) {
2537     vmovdqu(dst, as_Address(src));
2538   } else {
2540     lea(scratch_reg, src);
2541     vmovdqu(dst, Address(scratch_reg, 0));
2542   }
2543 }
2544 
2545 
2546 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) {
2547   if (reachable(src)) {
2548     kmovwl(dst, as_Address(src));
2549   } else {
2550     lea(scratch_reg, src);
2551     kmovwl(dst, Address(scratch_reg, 0));
2552   }
2553 }
2554 
2555 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2556                                int vector_len, Register scratch_reg) {
2557   if (reachable(src)) {
2558     if (mask == k0) {
2559       Assembler::evmovdqub(dst, as_Address(src), merge, vector_len);
2560     } else {
2561       Assembler::evmovdqub(dst, mask, as_Address(src), merge, vector_len);
2562     }
2563   } else {
2564     lea(scratch_reg, src);
2565     if (mask == k0) {
2566       Assembler::evmovdqub(dst, Address(scratch_reg, 0), merge, vector_len);
2567     } else {
2568       Assembler::evmovdqub(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2569     }
2570   }
2571 }
2572 
2573 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2574                                int vector_len, Register scratch_reg) {
2575   if (reachable(src)) {
2576     Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len);
2577   } else {
2578     lea(scratch_reg, src);
2579     Assembler::evmovdquw(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2580   }
2581 }
2582 
2583 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2584                                int vector_len, Register scratch_reg) {
2585   if (reachable(src)) {
2586     Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len);
2587   } else {
2588     lea(scratch_reg, src);
2589     Assembler::evmovdqul(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2590   }
2591 }
2592 
2593 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge,
2594                                int vector_len, Register scratch_reg) {
2595   if (reachable(src)) {
2596     Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len);
2597   } else {
2598     lea(scratch_reg, src);
2599     Assembler::evmovdquq(dst, mask, Address(scratch_reg, 0), merge, vector_len);
2600   }
2601 }
2602 
2603 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) {
2604   if (reachable(src)) {
2605     Assembler::evmovdquq(dst, as_Address(src), vector_len);
2606   } else {
2607     lea(rscratch, src);
2608     Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len);
2609   }
2610 }
2611 
2612 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
2613   if (reachable(src)) {
2614     Assembler::movdqa(dst, as_Address(src));
2615   } else {
2616     lea(rscratch1, src);
2617     Assembler::movdqa(dst, Address(rscratch1, 0));
2618   }
2619 }
2620 
2621 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
2622   if (reachable(src)) {
2623     Assembler::movsd(dst, as_Address(src));
2624   } else {
2625     lea(rscratch1, src);
2626     Assembler::movsd(dst, Address(rscratch1, 0));
2627   }
2628 }
2629 
2630 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
2631   if (reachable(src)) {
2632     Assembler::movss(dst, as_Address(src));
2633   } else {
2634     lea(rscratch1, src);
2635     Assembler::movss(dst, Address(rscratch1, 0));
2636   }
2637 }
2638 
2639 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
2640   if (reachable(src)) {
2641     Assembler::mulsd(dst, as_Address(src));
2642   } else {
2643     lea(rscratch1, src);
2644     Assembler::mulsd(dst, Address(rscratch1, 0));
2645   }
2646 }
2647 
2648 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
2649   if (reachable(src)) {
2650     Assembler::mulss(dst, as_Address(src));
2651   } else {
2652     lea(rscratch1, src);
2653     Assembler::mulss(dst, Address(rscratch1, 0));
2654   }
2655 }
2656 
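// null_check relies on HotSpot's implicit null checks: the page at address
// zero is unmapped, so dereferencing a NULL base traps and the signal
// handler converts the fault into a NullPointerException.
// needs_explicit_null_check(offset) is true when reg + offset could land
// outside the protected region, in which case the fault must be provoked
// eagerly with a dummy access at offset 0.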
2657 void MacroAssembler::null_check(Register reg, int offset) {
2658   if (needs_explicit_null_check(offset)) {
2659     // provoke OS NULL exception if reg = NULL by
2660     // accessing M[reg] w/o changing any (non-CC) registers
2661     // NOTE: cmpl is plenty here to provoke a segv
2662     cmpptr(rax, Address(reg, 0));
2663     // Note: should probably use testl(rax, Address(reg, 0));
2664     //       may be shorter code (however, this version of
2665     //       testl needs to be implemented first)
2666   } else {
2667     // nothing to do, (later) access of M[reg + offset]
2668     // will provoke OS NULL exception if reg = NULL
2669   }
2670 }
2671 
2672 void MacroAssembler::os_breakpoint() {
2673   // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
2674   // (e.g., MSVC can't call ps() otherwise)
2675   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
2676 }
2677 
2678 void MacroAssembler::unimplemented(const char* what) {
2679   const char* buf = NULL;
2680   {
2681     ResourceMark rm;
2682     stringStream ss;
2683     ss.print("unimplemented: %s", what);
2684     buf = code_string(ss.as_string());
2685   }
2686   stop(buf);
2687 }
2688 
2689 #ifdef _LP64
2690 #define XSTATE_BV 0x200
2691 #endif
2692 
2693 void MacroAssembler::pop_CPU_state() {
2694   pop_FPU_state();
2695   pop_IU_state();
2696 }
2697 
2698 void MacroAssembler::pop_FPU_state() {
2699 #ifndef _LP64
2700   frstor(Address(rsp, 0));
2701 #else
2702   fxrstor(Address(rsp, 0));
2703 #endif
2704   addptr(rsp, FPUStateSizeInWords * wordSize);
2705 }
2706 
2707 void MacroAssembler::pop_IU_state() {
2708   popa();
2709   LP64_ONLY(addq(rsp, 8));
2710   popf();
2711 }
2712 
2713 // Save Integer and Float state
2714 // Warning: Stack must be 16 byte aligned (64bit)
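// (On 64-bit this matters because fxsave/fxrstor fault on a save area that
// is not 16-byte aligned; push_IU_state keeps rsp 16-byte aligned with an
// extra 8-byte adjustment around pusha for the same reason.)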
2715 void MacroAssembler::push_CPU_state() {
2716   push_IU_state();
2717   push_FPU_state();
2718 }
2719 
2720 void MacroAssembler::push_FPU_state() {
2721   subptr(rsp, FPUStateSizeInWords * wordSize);
2722 #ifndef _LP64
2723   fnsave(Address(rsp, 0));
2724   fwait();
2725 #else
2726   fxsave(Address(rsp, 0));
2727 #endif // LP64
2728 }
2729 
2730 void MacroAssembler::push_IU_state() {
2731   // Push flags first because pusha kills them
2732   pushf();
2733   // Make sure rsp stays 16-byte aligned
2734   LP64_ONLY(subq(rsp, 8));
2735   pusha();
2736 }
2737 
2738 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
2739   if (!java_thread->is_valid()) {
2740     java_thread = rdi;
2741     get_thread(java_thread);
2742   }
2743   // we must set sp to zero to clear frame
2744   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
2745   if (clear_fp) {
2746     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
2747   }
2748 
2749   // Always clear the pc because it could have been set by make_walkable()
2750   movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
2751 
2752   vzeroupper();
2753 }
2754 
2755 void MacroAssembler::restore_rax(Register tmp) {
2756   if (tmp == noreg) pop(rax);
2757   else if (tmp != rax) mov(rax, tmp);
2758 }
2759 
2760 void MacroAssembler::round_to(Register reg, int modulus) {
2761   addptr(reg, modulus - 1);
2762   andptr(reg, -modulus);
2763 }
2764 
2765 void MacroAssembler::save_rax(Register tmp) {
2766   if (tmp == noreg) push(rax);
2767   else if (tmp != rax) mov(tmp, rax);
2768 }
2769 
2770 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) {
2771 #ifdef _LP64
2772   assert(thread_reg == r15_thread, "should be");
2773 #else
2774   if (thread_reg == noreg) {
2775     thread_reg = temp_reg;
2776     get_thread(thread_reg);
2777   }
2778 #endif
2779   testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit());
2780   jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
2781 }
2782 
2783 // Calls to C land
2784 //
2785 // When entering C land, the rbp and rsp of the last Java frame have to be
2786 // recorded in the (thread-local) JavaThread object. When leaving C land, the
2787 // last Java fp has to be reset to 0. This is required for proper stack traversal.
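//
// The (last_java_sp, last_java_fp, last_java_pc) triple recorded here forms
// the JavaFrameAnchor stored in the JavaThread; the stack walker starts from
// it to find the topmost Java frame while the thread is in native or VM code.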
2788 void MacroAssembler::set_last_Java_frame(Register java_thread,
2789                                          Register last_java_sp,
2790                                          Register last_java_fp,
2791                                          address  last_java_pc) {
2792   vzeroupper();
2793   // determine java_thread register
2794   if (!java_thread->is_valid()) {
2795     java_thread = rdi;
2796     get_thread(java_thread);
2797   }
2798   // determine last_java_sp register
2799   if (!last_java_sp->is_valid()) {
2800     last_java_sp = rsp;
2801   }
2802 
2803   // last_java_fp is optional
2804 
2805   if (last_java_fp->is_valid()) {
2806     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
2807   }
2808 
2809   // last_java_pc is optional
2810 
2811   if (last_java_pc != NULL) {
2812     lea(Address(java_thread,
2813                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
2814         InternalAddress(last_java_pc));
2815 
2816   }
2817   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
2818 }
2819 
2820 void MacroAssembler::shlptr(Register dst, int imm8) {
2821   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
2822 }
2823 
2824 void MacroAssembler::shrptr(Register dst, int imm8) {
2825   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
2826 }
2827 
2828 void MacroAssembler::sign_extend_byte(Register reg) {
2829   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
2830     movsbl(reg, reg); // movsxb
2831   } else {
2832     shll(reg, 24);
2833     sarl(reg, 24);
2834   }
2835 }
2836 
2837 void MacroAssembler::sign_extend_short(Register reg) {
2838   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
2839     movswl(reg, reg); // movsxw
2840   } else {
2841     shll(reg, 16);
2842     sarl(reg, 16);
2843   }
2844 }
2845 
2846 void MacroAssembler::testl(Register dst, AddressLiteral src) {
2847   assert(reachable(src), "Address should be reachable");
2848   testl(dst, as_Address(src));
2849 }
2850 
2851 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
2852   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2853   Assembler::pcmpeqb(dst, src);
2854 }
2855 
2856 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
2857   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2858   Assembler::pcmpeqw(dst, src);
2859 }
2860 
2861 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2862   assert((dst->encoding() < 16),"XMM register should be 0-15");
2863   Assembler::pcmpestri(dst, src, imm8);
2864 }
2865 
2866 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2867   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2868   Assembler::pcmpestri(dst, src, imm8);
2869 }
2870 
2871 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
2872   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2873   Assembler::pmovzxbw(dst, src);
2874 }
2875 
2876 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
2877   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
2878   Assembler::pmovzxbw(dst, src);
2879 }
2880 
2881 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
2882   assert((src->encoding() < 16),"XMM register should be 0-15");
2883   Assembler::pmovmskb(dst, src);
2884 }
2885 
2886 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
2887   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
2888   Assembler::ptest(dst, src);
2889 }
2890 
2891 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
2892   if (reachable(src)) {
2893     Assembler::sqrtsd(dst, as_Address(src));
2894   } else {
2895     lea(rscratch1, src);
2896     Assembler::sqrtsd(dst, Address(rscratch1, 0));
2897   }
2898 }
2899 
2900 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
2901   if (reachable(src)) {
2902     Assembler::sqrtss(dst, as_Address(src));
2903   } else {
2904     lea(rscratch1, src);
2905     Assembler::sqrtss(dst, Address(rscratch1, 0));
2906   }
2907 }
2908 
2909 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
2910   if (reachable(src)) {
2911     Assembler::subsd(dst, as_Address(src));
2912   } else {
2913     lea(rscratch1, src);
2914     Assembler::subsd(dst, Address(rscratch1, 0));
2915   }
2916 }
2917 
2918 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) {
2919   if (reachable(src)) {
2920     Assembler::roundsd(dst, as_Address(src), rmode);
2921   } else {
2922     lea(scratch_reg, src);
2923     Assembler::roundsd(dst, Address(scratch_reg, 0), rmode);
2924   }
2925 }
2926 
2927 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
2928   if (reachable(src)) {
2929     Assembler::subss(dst, as_Address(src));
2930   } else {
2931     lea(rscratch1, src);
2932     Assembler::subss(dst, Address(rscratch1, 0));
2933   }
2934 }
2935 
2936 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
2937   if (reachable(src)) {
2938     Assembler::ucomisd(dst, as_Address(src));
2939   } else {
2940     lea(rscratch1, src);
2941     Assembler::ucomisd(dst, Address(rscratch1, 0));
2942   }
2943 }
2944 
2945 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
2946   if (reachable(src)) {
2947     Assembler::ucomiss(dst, as_Address(src));
2948   } else {
2949     lea(rscratch1, src);
2950     Assembler::ucomiss(dst, Address(rscratch1, 0));
2951   }
2952 }
2953 
2954 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2955   // Used in sign-bit flipping with aligned address.
2956   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2957   if (reachable(src)) {
2958     Assembler::xorpd(dst, as_Address(src));
2959   } else {
2960     lea(scratch_reg, src);
2961     Assembler::xorpd(dst, Address(scratch_reg, 0));
2962   }
2963 }
2964 
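// Without AVX512DQ there is no EVEX encoding for xorpd/xorps. The common
// self-xor (register clearing) case is therefore rewritten as an integer
// vpxor at 512-bit width, which plain AVX512F can encode for all registers.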
2965 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
2966   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
2967     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2968   } else {
2970     Assembler::xorpd(dst, src);
2971   }
2972 }
2973 
2974 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
2975   if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
2976     Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
2977   } else {
2978     Assembler::xorps(dst, src);
2979   }
2980 }
2981 
2982 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
2983   // Used in sign-bit flipping with aligned address.
2984   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
2985   if (reachable(src)) {
2986     Assembler::xorps(dst, as_Address(src));
2987   } else {
2988     lea(scratch_reg, src);
2989     Assembler::xorps(dst, Address(scratch_reg, 0));
2990   }
2991 }
2992 
2993 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
2994   // Used in sign-bit flipping with aligned address.
2995   bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
2996   assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
2997   if (reachable(src)) {
2998     Assembler::pshufb(dst, as_Address(src));
2999   } else {
3000     lea(rscratch1, src);
3001     Assembler::pshufb(dst, Address(rscratch1, 0));
3002   }
3003 }
3004 
3005 // AVX 3-operands instructions
3006 
3007 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3008   if (reachable(src)) {
3009     vaddsd(dst, nds, as_Address(src));
3010   } else {
3011     lea(rscratch1, src);
3012     vaddsd(dst, nds, Address(rscratch1, 0));
3013   }
3014 }
3015 
3016 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3017   if (reachable(src)) {
3018     vaddss(dst, nds, as_Address(src));
3019   } else {
3020     lea(rscratch1, src);
3021     vaddss(dst, nds, Address(rscratch1, 0));
3022   }
3023 }
3024 
3025 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
3026   assert(UseAVX > 0, "requires some form of AVX");
3027   if (reachable(src)) {
3028     Assembler::vpaddd(dst, nds, as_Address(src), vector_len);
3029   } else {
3030     lea(rscratch, src);
3031     Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len);
3032   }
3033 }
3034 
3035 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3036   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3037   vandps(dst, nds, negate_field, vector_len);
3038 }
3039 
3040 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
3041   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3042   vandpd(dst, nds, negate_field, vector_len);
3043 }
3044 
3045 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3046   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3047   Assembler::vpaddb(dst, nds, src, vector_len);
3048 }
3049 
3050 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3051   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3052   Assembler::vpaddb(dst, nds, src, vector_len);
3053 }
3054 
3055 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3056   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3057   Assembler::vpaddw(dst, nds, src, vector_len);
3058 }
3059 
3060 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3061   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3062   Assembler::vpaddw(dst, nds, src, vector_len);
3063 }
3064 
3065 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3066   if (reachable(src)) {
3067     Assembler::vpand(dst, nds, as_Address(src), vector_len);
3068   } else {
3069     lea(scratch_reg, src);
3070     Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len);
3071   }
3072 }
3073 
3074 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
3075   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3076   Assembler::vpbroadcastw(dst, src, vector_len);
3077 }
3078 
3079 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3080   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3081   Assembler::vpcmpeqb(dst, nds, src, vector_len);
3082 }
3083 
3084 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3085   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3086   Assembler::vpcmpeqw(dst, nds, src, vector_len);
3087 }
3088 
3089 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds,
3090                                AddressLiteral src, int vector_len, Register scratch_reg) {
3091   if (reachable(src)) {
3092     Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len);
3093   } else {
3094     lea(scratch_reg, src);
3095     Assembler::evpcmpeqd(kdst, mask, nds, Address(scratch_reg, 0), vector_len);
3096   }
3097 }
3098 
3099 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3100                              int comparison, int vector_len, Register scratch_reg) {
3101   if (reachable(src)) {
3102     Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, vector_len);
3103   } else {
3104     lea(scratch_reg, src);
3105     Assembler::evpcmpd(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
3106   }
3107 }
3108 
3109 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3110                              int comparison, int vector_len, Register scratch_reg) {
3111   if (reachable(src)) {
3112     Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, vector_len);
3113   } else {
3114     lea(scratch_reg, src);
3115     Assembler::evpcmpq(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
3116   }
3117 }
3118 
3119 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3120                              int comparison, int vector_len, Register scratch_reg) {
3121   if (reachable(src)) {
3122     Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, vector_len);
3123   } else {
3124     lea(scratch_reg, src);
3125     Assembler::evpcmpb(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
3126   }
3127 }
3128 
3129 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src,
3130                              int comparison, int vector_len, Register scratch_reg) {
3131   if (reachable(src)) {
3132     Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, vector_len);
3133   } else {
3134     lea(scratch_reg, src);
3135     Assembler::evpcmpw(kdst, mask, nds, Address(scratch_reg, 0), comparison, vector_len);
3136   }
3137 }
3138 
3139 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) {
3140   if (width == Assembler::Q) {
3141     Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len);
3142   } else {
3143     Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len);
3144   }
3145 }
3146 
3147 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, ComparisonPredicate cond, Width width, int vector_len, Register scratch_reg) {
3148   int eq_cond_enc = 0x29;
3149   int gt_cond_enc = 0x37;
3150   if (width != Assembler::Q) {
3151     eq_cond_enc = 0x74 + width;
3152     gt_cond_enc = 0x64 + width;
3153   }
3154   switch (cond) {
3155   case eq:
3156     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3157     break;
3158   case neq:
3159     vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len);
3160     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3161     break;
3162   case le:
3163     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3164     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3165     break;
3166   case nlt:
3167     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3168     vpxor(dst, dst, ExternalAddress(StubRoutines::x86::vector_all_bits_set()), vector_len, scratch_reg);
3169     break;
3170   case lt:
3171     vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len);
3172     break;
3173   case nle:
3174     vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len);
3175     break;
3176   default:
3177     assert(false, "Should not reach here");
3178   }
3179 }
3180 
3181 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
3182   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3183   Assembler::vpmovzxbw(dst, src, vector_len);
3184 }
3185 
3186 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
3187   assert((src->encoding() < 16),"XMM register should be 0-15");
3188   Assembler::vpmovmskb(dst, src);
3189 }
3190 
3191 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3192   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3193   Assembler::vpmullw(dst, nds, src, vector_len);
3194 }
3195 
3196 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3197   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3198   Assembler::vpmullw(dst, nds, src, vector_len);
3199 }
3200 
3201 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3202   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3203   Assembler::vpsubb(dst, nds, src, vector_len);
3204 }
3205 
3206 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3207   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3208   Assembler::vpsubb(dst, nds, src, vector_len);
3209 }
3210 
3211 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
3212   assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3213   Assembler::vpsubw(dst, nds, src, vector_len);
3214 }
3215 
3216 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
3217   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3218   Assembler::vpsubw(dst, nds, src, vector_len);
3219 }
3220 
3221 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3222   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3223   Assembler::vpsraw(dst, nds, shift, vector_len);
3224 }
3225 
3226 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3227   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3228   Assembler::vpsraw(dst, nds, shift, vector_len);
3229 }
3230 
3231 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3232   assert(UseAVX > 2,"");
3233   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3234      vector_len = 2;
3235   }
3236   Assembler::evpsraq(dst, nds, shift, vector_len);
3237 }
3238 
3239 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3240   assert(UseAVX > 2,"");
3241   if (!VM_Version::supports_avx512vl() && vector_len < 2) {
3242      vector_len = 2;
3243   }
3244   Assembler::evpsraq(dst, nds, shift, vector_len);
3245 }
3246 
3247 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3248   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3249   Assembler::vpsrlw(dst, nds, shift, vector_len);
3250 }
3251 
3252 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3253   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3254   Assembler::vpsrlw(dst, nds, shift, vector_len);
3255 }
3256 
3257 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
3258   assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3259   Assembler::vpsllw(dst, nds, shift, vector_len);
3260 }
3261 
3262 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
3263   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3264   Assembler::vpsllw(dst, nds, shift, vector_len);
3265 }
3266 
3267 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
3268   assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15");
3269   Assembler::vptest(dst, src);
3270 }
3271 
3272 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
3273   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3274   Assembler::punpcklbw(dst, src);
3275 }
3276 
3277 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) {
3278   assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15");
3279   Assembler::pshufd(dst, src, mode);
3280 }
3281 
3282 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
3283   assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
3284   Assembler::pshuflw(dst, src, mode);
3285 }
3286 
3287 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3288   if (reachable(src)) {
3289     vandpd(dst, nds, as_Address(src), vector_len);
3290   } else {
3291     lea(scratch_reg, src);
3292     vandpd(dst, nds, Address(scratch_reg, 0), vector_len);
3293   }
3294 }
3295 
3296 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3297   if (reachable(src)) {
3298     vandps(dst, nds, as_Address(src), vector_len);
3299   } else {
3300     lea(scratch_reg, src);
3301     vandps(dst, nds, Address(scratch_reg, 0), vector_len);
3302   }
3303 }
3304 
3305 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src,
3306                             bool merge, int vector_len, Register scratch_reg) {
3307   if (reachable(src)) {
3308     Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len);
3309   } else {
3310     lea(scratch_reg, src);
3311     Assembler::evpord(dst, mask, nds, Address(scratch_reg, 0), merge, vector_len);
3312   }
3313 }
3314 
3315 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3316   if (reachable(src)) {
3317     vdivsd(dst, nds, as_Address(src));
3318   } else {
3319     lea(rscratch1, src);
3320     vdivsd(dst, nds, Address(rscratch1, 0));
3321   }
3322 }
3323 
3324 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3325   if (reachable(src)) {
3326     vdivss(dst, nds, as_Address(src));
3327   } else {
3328     lea(rscratch1, src);
3329     vdivss(dst, nds, Address(rscratch1, 0));
3330   }
3331 }
3332 
3333 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3334   if (reachable(src)) {
3335     vmulsd(dst, nds, as_Address(src));
3336   } else {
3337     lea(rscratch1, src);
3338     vmulsd(dst, nds, Address(rscratch1, 0));
3339   }
3340 }
3341 
3342 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3343   if (reachable(src)) {
3344     vmulss(dst, nds, as_Address(src));
3345   } else {
3346     lea(rscratch1, src);
3347     vmulss(dst, nds, Address(rscratch1, 0));
3348   }
3349 }
3350 
3351 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3352   if (reachable(src)) {
3353     vsubsd(dst, nds, as_Address(src));
3354   } else {
3355     lea(rscratch1, src);
3356     vsubsd(dst, nds, Address(rscratch1, 0));
3357   }
3358 }
3359 
3360 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3361   if (reachable(src)) {
3362     vsubss(dst, nds, as_Address(src));
3363   } else {
3364     lea(rscratch1, src);
3365     vsubss(dst, nds, Address(rscratch1, 0));
3366   }
3367 }
3368 
3369 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3370   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3371   vxorps(dst, nds, src, Assembler::AVX_128bit);
3372 }
3373 
3374 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
3375   assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15");
3376   vxorpd(dst, nds, src, Assembler::AVX_128bit);
3377 }
3378 
3379 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3380   if (reachable(src)) {
3381     vxorpd(dst, nds, as_Address(src), vector_len);
3382   } else {
3383     lea(scratch_reg, src);
3384     vxorpd(dst, nds, Address(scratch_reg, 0), vector_len);
3385   }
3386 }
3387 
3388 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3389   if (reachable(src)) {
3390     vxorps(dst, nds, as_Address(src), vector_len);
3391   } else {
3392     lea(scratch_reg, src);
3393     vxorps(dst, nds, Address(scratch_reg, 0), vector_len);
3394   }
3395 }
3396 
3397 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3398   if (UseAVX > 1 || (vector_len < 1)) {
3399     if (reachable(src)) {
3400       Assembler::vpxor(dst, nds, as_Address(src), vector_len);
3401     } else {
3402       lea(scratch_reg, src);
3403       Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len);
3404     }
3405   }
3406   else {
3407     MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg);
3408   }
3409 }
3410 
void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
3412   if (reachable(src)) {
3413     Assembler::vpermd(dst, nds, as_Address(src), vector_len);
3414   } else {
3415     lea(scratch_reg, src);
3416     Assembler::vpermd(dst, nds, Address(scratch_reg, 0), vector_len);
3417   }
3418 }
3419 
3420 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
3421   const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask);
3422   STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code
3423   // The inverted mask is sign-extended
3424   andptr(possibly_jweak, inverted_jweak_mask);
3425 }
3426 
3427 void MacroAssembler::resolve_jobject(Register value,
3428                                      Register thread,
3429                                      Register tmp) {
3430   assert_different_registers(value, thread, tmp);
3431   Label done, not_weak;
3432   testptr(value, value);
3433   jcc(Assembler::zero, done);                // Use NULL as-is.
3434   testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag.
3435   jcc(Assembler::zero, not_weak);
3436   // Resolve jweak.
3437   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3438                  value, Address(value, -JNIHandles::weak_tag_value), tmp, thread);
3439   verify_oop(value);
3440   jmp(done);
3441   bind(not_weak);
3442   // Resolve (untagged) jobject.
3443   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
3444   verify_oop(value);
3445   bind(done);
3446 }
3447 
3448 void MacroAssembler::subptr(Register dst, int32_t imm32) {
3449   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
3450 }
3451 
// Force generation of a 4-byte immediate value even if it fits into 8 bits
3453 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
3454   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
3455 }
3456 
3457 void MacroAssembler::subptr(Register dst, Register src) {
3458   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
3459 }
3460 
3461 // C++ bool manipulation
void MacroAssembler::testbool(Register dst) {
  if (sizeof(bool) == 1) {
    testb(dst, 0xff);
  } else if (sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if (sizeof(bool) == 4) {
    testl(dst, dst);
  } else {
    // unsupported
    ShouldNotReachHere();
  }
}
3474 
3475 void MacroAssembler::testptr(Register dst, Register src) {
3476   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
3477 }
3478 
3479 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3480 void MacroAssembler::tlab_allocate(Register thread, Register obj,
3481                                    Register var_size_in_bytes,
3482                                    int con_size_in_bytes,
3483                                    Register t1,
3484                                    Register t2,
3485                                    Label& slow_case) {
3486   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3487   bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
3488 }
3489 
3490 // Defines obj, preserves var_size_in_bytes
3491 void MacroAssembler::eden_allocate(Register thread, Register obj,
3492                                    Register var_size_in_bytes,
3493                                    int con_size_in_bytes,
3494                                    Register t1,
3495                                    Label& slow_case) {
3496   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3497   bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
3498 }
3499 
// Preserves the contents of address; destroys the contents of length_in_bytes and temp.
3501 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
3502   assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
3503   assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
3504   Label done;
3505 
3506   testptr(length_in_bytes, length_in_bytes);
3507   jcc(Assembler::zero, done);
3508 
3509   // initialize topmost word, divide index by 2, check if odd and test if zero
3510   // note: for the remaining code to work, index must be a multiple of BytesPerWord
3511 #ifdef ASSERT
3512   {
3513     Label L;
3514     testptr(length_in_bytes, BytesPerWord - 1);
3515     jcc(Assembler::zero, L);
3516     stop("length must be a multiple of BytesPerWord");
3517     bind(L);
3518   }
3519 #endif
3520   Register index = length_in_bytes;
3521   xorptr(temp, temp);    // use _zero reg to clear memory (shorter code)
3522   if (UseIncDec) {
    shrptr(index, 3);  // divide by 8 and set the carry flag if bit 2 was set
3524   } else {
3525     shrptr(index, 2);  // use 2 instructions to avoid partial flag stall
3526     shrptr(index, 1);
3527   }
3528 #ifndef _LP64
  // index might not have been a multiple of 8 (i.e., bit 2 was set)
3530   {
3531     Label even;
3532     // note: if index was a multiple of 8, then it cannot
3533     //       be 0 now otherwise it must have been 0 before
3534     //       => if it is even, we don't need to check for 0 again
3535     jcc(Assembler::carryClear, even);
3536     // clear topmost word (no jump would be needed if conditional assignment worked here)
3537     movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp);
3538     // index could be 0 now, must check again
3539     jcc(Assembler::zero, done);
3540     bind(even);
3541   }
3542 #endif // !_LP64
3543   // initialize remaining object fields: index is a multiple of 2 now
3544   {
3545     Label loop;
3546     bind(loop);
3547     movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
3548     NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
3549     decrement(index);
3550     jcc(Assembler::notZero, loop);
3551   }
3552 
3553   bind(done);
3554 }
3555 
3556 // Look up the method for a megamorphic invokeinterface call.
3557 // The target method is determined by <intf_klass, itable_index>.
3558 // The receiver klass is in recv_klass.
3559 // On success, the result will be in method_result, and execution falls through.
3560 // On failure, execution transfers to the given label.
3561 void MacroAssembler::lookup_interface_method(Register recv_klass,
3562                                              Register intf_klass,
3563                                              RegisterOrConstant itable_index,
3564                                              Register method_result,
3565                                              Register scan_temp,
3566                                              Label& L_no_such_interface,
3567                                              bool return_method) {
3568   assert_different_registers(recv_klass, intf_klass, scan_temp);
3569   assert_different_registers(method_result, intf_klass, scan_temp);
3570   assert(recv_klass != method_result || !return_method,
3571          "recv_klass can be destroyed when method isn't needed");
3572 
3573   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3574          "caller must use same register for non-constant itable index as for method");
3575 
3576   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
3577   int vtable_base = in_bytes(Klass::vtable_start_offset());
3578   int itentry_off = itableMethodEntry::method_offset_in_bytes();
3579   int scan_step   = itableOffsetEntry::size() * wordSize;
3580   int vte_size    = vtableEntry::size_in_bytes();
3581   Address::ScaleFactor times_vte_scale = Address::times_ptr;
3582   assert(vte_size == wordSize, "else adjust times_vte_scale");
3583 
3584   movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3585 
3586   // %%% Could store the aligned, prescaled offset in the klassoop.
3587   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
3588 
3589   if (return_method) {
3590     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3591     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3592     lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
3593   }
3594 
3595   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
3596   //   if (scan->interface() == intf) {
3597   //     result = (klass + scan->offset() + itable_index);
3598   //   }
3599   // }
3600   Label search, found_method;
3601 
3602   for (int peel = 1; peel >= 0; peel--) {
3603     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
3604     cmpptr(intf_klass, method_result);
3605 
3606     if (peel) {
3607       jccb(Assembler::equal, found_method);
3608     } else {
3609       jccb(Assembler::notEqual, search);
3610       // (invert the test to fall through to found_method...)
3611     }
3612 
3613     if (!peel)  break;
3614 
3615     bind(search);
3616 
    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface and wasn't the
    // same class as when the caller was compiled.
3620     testptr(method_result, method_result);
3621     jcc(Assembler::zero, L_no_such_interface);
3622     addptr(scan_temp, scan_step);
3623   }
3624 
3625   bind(found_method);
3626 
3627   if (return_method) {
3628     // Got a hit.
3629     movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
3630     movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
3631   }
3632 }
3633 
3634 
3635 // virtual method calling
3636 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3637                                            RegisterOrConstant vtable_index,
3638                                            Register method_result) {
3639   const int base = in_bytes(Klass::vtable_start_offset());
3640   assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
3641   Address vtable_entry_addr(recv_klass,
3642                             vtable_index, Address::times_ptr,
3643                             base + vtableEntry::method_offset_in_bytes());
3644   movptr(method_result, vtable_entry_addr);
3645 }
3646 
3647 
3648 void MacroAssembler::check_klass_subtype(Register sub_klass,
3649                            Register super_klass,
3650                            Register temp_reg,
3651                            Label& L_success) {
3652   Label L_failure;
3653   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
3654   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
3655   bind(L_failure);
3656 }
3657 
3658 
3659 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
3660                                                    Register super_klass,
3661                                                    Register temp_reg,
3662                                                    Label* L_success,
3663                                                    Label* L_failure,
3664                                                    Label* L_slow_path,
3665                                         RegisterOrConstant super_check_offset) {
3666   assert_different_registers(sub_klass, super_klass, temp_reg);
3667   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
3668   if (super_check_offset.is_register()) {
3669     assert_different_registers(sub_klass, super_klass,
3670                                super_check_offset.as_register());
3671   } else if (must_load_sco) {
3672     assert(temp_reg != noreg, "supply either a temp or a register offset");
3673   }
3674 
3675   Label L_fallthrough;
3676   int label_nulls = 0;
3677   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3678   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3679   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
3680   assert(label_nulls <= 1, "at most one NULL in the batch");
3681 
3682   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3683   int sco_offset = in_bytes(Klass::super_check_offset_offset());
3684   Address super_check_offset_addr(super_klass, sco_offset);
3685 
3686   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
3687   // range of a jccb.  If this routine grows larger, reconsider at
3688   // least some of these.
3689 #define local_jcc(assembler_cond, label)                                \
3690   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
3691   else                             jcc( assembler_cond, label) /*omit semi*/
3692 
3693   // Hacked jmp, which may only be used just before L_fallthrough.
3694 #define final_jmp(label)                                                \
3695   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
3696   else                            jmp(label)                /*omit semi*/
3697 
3698   // If the pointers are equal, we are done (e.g., String[] elements).
3699   // This self-check enables sharing of secondary supertype arrays among
3700   // non-primary types such as array-of-interface.  Otherwise, each such
3701   // type would need its own customized SSA.
3702   // We move this check to the front of the fast path because many
3703   // type checks are in fact trivially successful in this manner,
3704   // so we get a nicely predicted branch right at the start of the check.
3705   cmpptr(sub_klass, super_klass);
3706   local_jcc(Assembler::equal, *L_success);
3707 
3708   // Check the supertype display:
3709   if (must_load_sco) {
3710     // Positive movl does right thing on LP64.
3711     movl(temp_reg, super_check_offset_addr);
3712     super_check_offset = RegisterOrConstant(temp_reg);
3713   }
3714   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
3715   cmpptr(super_klass, super_check_addr); // load displayed supertype
3716 
3717   // This check has worked decisively for primary supers.
3718   // Secondary supers are sought in the super_cache ('super_cache_addr').
3719   // (Secondary supers are interfaces and very deeply nested subtypes.)
3720   // This works in the same check above because of a tricky aliasing
3721   // between the super_cache and the primary super display elements.
3722   // (The 'super_check_addr' can address either, as the case requires.)
3723   // Note that the cache is updated below if it does not help us find
3724   // what we need immediately.
3725   // So if it was a primary super, we can just fail immediately.
3726   // Otherwise, it's the slow path for us (no success at this point).
3727 
3728   if (super_check_offset.is_register()) {
3729     local_jcc(Assembler::equal, *L_success);
3730     cmpl(super_check_offset.as_register(), sc_offset);
3731     if (L_failure == &L_fallthrough) {
3732       local_jcc(Assembler::equal, *L_slow_path);
3733     } else {
3734       local_jcc(Assembler::notEqual, *L_failure);
3735       final_jmp(*L_slow_path);
3736     }
3737   } else if (super_check_offset.as_constant() == sc_offset) {
3738     // Need a slow path; fast failure is impossible.
3739     if (L_slow_path == &L_fallthrough) {
3740       local_jcc(Assembler::equal, *L_success);
3741     } else {
3742       local_jcc(Assembler::notEqual, *L_slow_path);
3743       final_jmp(*L_success);
3744     }
3745   } else {
3746     // No slow path; it's a fast decision.
3747     if (L_failure == &L_fallthrough) {
3748       local_jcc(Assembler::equal, *L_success);
3749     } else {
3750       local_jcc(Assembler::notEqual, *L_failure);
3751       final_jmp(*L_success);
3752     }
3753   }
3754 
3755   bind(L_fallthrough);
3756 
3757 #undef local_jcc
3758 #undef final_jmp
3759 }
3760 
3761 
3762 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
3763                                                    Register super_klass,
3764                                                    Register temp_reg,
3765                                                    Register temp2_reg,
3766                                                    Label* L_success,
3767                                                    Label* L_failure,
3768                                                    bool set_cond_codes) {
3769   assert_different_registers(sub_klass, super_klass, temp_reg);
3770   if (temp2_reg != noreg)
3771     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
3772 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
3773 
3774   Label L_fallthrough;
3775   int label_nulls = 0;
3776   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
3777   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
3778   assert(label_nulls <= 1, "at most one NULL in the batch");
3779 
3780   // a couple of useful fields in sub_klass:
3781   int ss_offset = in_bytes(Klass::secondary_supers_offset());
3782   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3783   Address secondary_supers_addr(sub_klass, ss_offset);
3784   Address super_cache_addr(     sub_klass, sc_offset);
3785 
3786   // Do a linear scan of the secondary super-klass chain.
3787   // This code is rarely used, so simplicity is a virtue here.
3788   // The repne_scan instruction uses fixed registers, which we must spill.
3789   // Don't worry too much about pre-existing connections with the input regs.
3790 
3791   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
3792   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
3793 
3794   // Get super_klass value into rax (even if it was in rdi or rcx).
3795   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
3796   if (super_klass != rax || UseCompressedOops) {
3797     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
3798     mov(rax, super_klass);
3799   }
3800   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
3801   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
3802 
3803 #ifndef PRODUCT
3804   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
3805   ExternalAddress pst_counter_addr((address) pst_counter);
3806   NOT_LP64(  incrementl(pst_counter_addr) );
3807   LP64_ONLY( lea(rcx, pst_counter_addr) );
3808   LP64_ONLY( incrementl(Address(rcx, 0)) );
3809 #endif //PRODUCT
3810 
3811   // We will consult the secondary-super array.
3812   movptr(rdi, secondary_supers_addr);
3813   // Load the array length.  (Positive movl does right thing on LP64.)
3814   movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
3815   // Skip to start of data.
3816   addptr(rdi, Array<Klass*>::base_offset_in_bytes());
3817 
3818   // Scan RCX words at [RDI] for an occurrence of RAX.
3819   // Set NZ/Z based on last compare.
  // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself
  // does not change flags; only the repeated scas instruction sets them.
  // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
3823 
  testptr(rax, rax); // Set Z = 0
  repne_scan();
3826 
3827   // Unspill the temp. registers:
3828   if (pushed_rdi)  pop(rdi);
3829   if (pushed_rcx)  pop(rcx);
3830   if (pushed_rax)  pop(rax);
3831 
3832   if (set_cond_codes) {
3833     // Special hack for the AD files:  rdi is guaranteed non-zero.
3834     assert(!pushed_rdi, "rdi must be left non-NULL");
3835     // Also, the condition codes are properly set Z/NZ on succeed/failure.
3836   }
3837 
3838   if (L_failure == &L_fallthrough)
3839         jccb(Assembler::notEqual, *L_failure);
3840   else  jcc(Assembler::notEqual, *L_failure);
3841 
3842   // Success.  Cache the super we found and proceed in triumph.
3843   movptr(super_cache_addr, super_klass);
3844 
3845   if (L_success != &L_fallthrough) {
3846     jmp(*L_success);
3847   }
3848 
3849 #undef IS_A_TEMP
3850 
3851   bind(L_fallthrough);
3852 }
3853 
3854 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
3855   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
3856 
3857   Label L_fallthrough;
3858   if (L_fast_path == NULL) {
3859     L_fast_path = &L_fallthrough;
3860   } else if (L_slow_path == NULL) {
3861     L_slow_path = &L_fallthrough;
3862   }
3863 
3864   // Fast path check: class is fully initialized
3865   cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
3866   jcc(Assembler::equal, *L_fast_path);
3867 
3868   // Fast path check: current thread is initializer thread
3869   cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
3870   if (L_slow_path == &L_fallthrough) {
3871     jcc(Assembler::equal, *L_fast_path);
3872     bind(*L_slow_path);
3873   } else if (L_fast_path == &L_fallthrough) {
3874     jcc(Assembler::notEqual, *L_slow_path);
3875     bind(*L_fast_path);
3876   } else {
3877     Unimplemented();
3878   }
3879 }
3880 
3881 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
3882   if (VM_Version::supports_cmov()) {
3883     cmovl(cc, dst, src);
3884   } else {
3885     Label L;
3886     jccb(negate_condition(cc), L);
3887     movl(dst, src);
3888     bind(L);
3889   }
3890 }
3891 
3892 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
3893   if (VM_Version::supports_cmov()) {
3894     cmovl(cc, dst, src);
3895   } else {
3896     Label L;
3897     jccb(negate_condition(cc), L);
3898     movl(dst, src);
3899     bind(L);
3900   }
3901 }
3902 
3903 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
3904   if (!VerifyOops) return;
3905 
3906   // Pass register number to verify_oop_subroutine
3907   const char* b = NULL;
3908   {
3909     ResourceMark rm;
3910     stringStream ss;
3911     ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
3912     b = code_string(ss.as_string());
3913   }
3914   BLOCK_COMMENT("verify_oop {");
3915 #ifdef _LP64
3916   push(rscratch1);                    // save r10, trashed by movptr()
3917 #endif
3918   push(rax);                          // save rax,
3919   push(reg);                          // pass register argument
3920   ExternalAddress buffer((address) b);
3921   // avoid using pushptr, as it modifies scratch registers
3922   // and our contract is not to modify anything
3923   movptr(rax, buffer.addr());
3924   push(rax);
3925   // call indirectly to solve generation ordering problem
3926   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
3927   call(rax);
3928   // Caller pops the arguments (oop, message) and restores rax, r10
3929   BLOCK_COMMENT("} verify_oop");
3930 }
3931 
3932 void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
3933   if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
3934     vpternlogd(dst, 0xFF, dst, dst, vector_len);
3935   } else {
3936     assert(UseAVX > 0, "");
3937     vpcmpeqb(dst, dst, dst, vector_len);
3938   }
3939 }
3940 
3941 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
3942                                                       Register tmp,
3943                                                       int offset) {
3944   intptr_t value = *delayed_value_addr;
3945   if (value != 0)
3946     return RegisterOrConstant(value + offset);
3947 
3948   // load indirectly to solve generation ordering problem
3949   movptr(tmp, ExternalAddress((address) delayed_value_addr));
3950 
3951 #ifdef ASSERT
3952   { Label L;
3953     testptr(tmp, tmp);
3954     if (WizardMode) {
3955       const char* buf = NULL;
3956       {
3957         ResourceMark rm;
3958         stringStream ss;
3959         ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
3960         buf = code_string(ss.as_string());
3961       }
3962       jcc(Assembler::notZero, L);
3963       STOP(buf);
3964     } else {
3965       jccb(Assembler::notZero, L);
3966       hlt();
3967     }
3968     bind(L);
3969   }
3970 #endif
3971 
3972   if (offset != 0)
3973     addptr(tmp, offset);
3974 
3975   return RegisterOrConstant(tmp);
3976 }
3977 
3978 
3979 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
3980                                          int extra_slot_offset) {
3981   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
3982   int stackElementSize = Interpreter::stackElementSize;
3983   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
3984 #ifdef ASSERT
3985   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
3986   assert(offset1 - offset == stackElementSize, "correct arithmetic");
3987 #endif
3988   Register             scale_reg    = noreg;
3989   Address::ScaleFactor scale_factor = Address::no_scale;
3990   if (arg_slot.is_constant()) {
3991     offset += arg_slot.as_constant() * stackElementSize;
3992   } else {
3993     scale_reg    = arg_slot.as_register();
3994     scale_factor = Address::times(stackElementSize);
3995   }
3996   offset += wordSize;           // return PC is on stack
3997   return Address(rsp, scale_reg, scale_factor, offset);
3998 }
3999 
4000 
4001 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
4002   if (!VerifyOops) return;
4003 
4004   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
4005   // Pass register number to verify_oop_subroutine
4006   const char* b = NULL;
4007   {
4008     ResourceMark rm;
4009     stringStream ss;
4010     ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
4011     b = code_string(ss.as_string());
4012   }
4013 #ifdef _LP64
4014   push(rscratch1);                    // save r10, trashed by movptr()
4015 #endif
4016   push(rax);                          // save rax,
4017   // addr may contain rsp so we will have to adjust it based on the push
4018   // we just did (and on 64 bit we do two pushes)
  // NOTE: the 64-bit code seems to have had a bug here: it did movq(addr, rax),
  //       which stores rax into addr, the reverse of what was intended.
4021   if (addr.uses(rsp)) {
4022     lea(rax, addr);
4023     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
4024   } else {
4025     pushptr(addr);
4026   }
4027 
4028   ExternalAddress buffer((address) b);
4029   // pass msg argument
4030   // avoid using pushptr, as it modifies scratch registers
4031   // and our contract is not to modify anything
4032   movptr(rax, buffer.addr());
4033   push(rax);
4034 
4035   // call indirectly to solve generation ordering problem
4036   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
4037   call(rax);
4038   // Caller pops the arguments (addr, message) and restores rax, r10.
4039 }
4040 
4041 void MacroAssembler::verify_tlab() {
4042 #ifdef ASSERT
4043   if (UseTLAB && VerifyOops) {
4044     Label next, ok;
4045     Register t1 = rsi;
4046     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
4047 
4048     push(t1);
4049     NOT_LP64(push(thread_reg));
4050     NOT_LP64(get_thread(thread_reg));
4051 
4052     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4053     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
4054     jcc(Assembler::aboveEqual, next);
4055     STOP("assert(top >= start)");
4056     should_not_reach_here();
4057 
4058     bind(next);
4059     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
4060     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
4061     jcc(Assembler::aboveEqual, ok);
4062     STOP("assert(top <= end)");
4063     should_not_reach_here();
4064 
4065     bind(ok);
4066     NOT_LP64(pop(thread_reg));
4067     pop(t1);
4068   }
4069 #endif
4070 }
4071 
4072 class ControlWord {
4073  public:
4074   int32_t _value;
4075 
4076   int  rounding_control() const        { return  (_value >> 10) & 3      ; }
4077   int  precision_control() const       { return  (_value >>  8) & 3      ; }
4078   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4079   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4080   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4081   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4082   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4083   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4084 
4085   void print() const {
4086     // rounding control
4087     const char* rc;
4088     switch (rounding_control()) {
4089       case 0: rc = "round near"; break;
4090       case 1: rc = "round down"; break;
4091       case 2: rc = "round up  "; break;
4092       case 3: rc = "chop      "; break;
4093     };
4094     // precision control
4095     const char* pc;
4096     switch (precision_control()) {
4097       case 0: pc = "24 bits "; break;
4098       case 1: pc = "reserved"; break;
4099       case 2: pc = "53 bits "; break;
4100       case 3: pc = "64 bits "; break;
4101     };
4102     // flags
4103     char f[9];
4104     f[0] = ' ';
4105     f[1] = ' ';
4106     f[2] = (precision   ()) ? 'P' : 'p';
4107     f[3] = (underflow   ()) ? 'U' : 'u';
4108     f[4] = (overflow    ()) ? 'O' : 'o';
4109     f[5] = (zero_divide ()) ? 'Z' : 'z';
4110     f[6] = (denormalized()) ? 'D' : 'd';
4111     f[7] = (invalid     ()) ? 'I' : 'i';
4112     f[8] = '\x0';
4113     // output
4114     printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
4115   }
4116 
4117 };
4118 
4119 class StatusWord {
4120  public:
4121   int32_t _value;
4122 
4123   bool busy() const                    { return ((_value >> 15) & 1) != 0; }
4124   bool C3() const                      { return ((_value >> 14) & 1) != 0; }
4125   bool C2() const                      { return ((_value >> 10) & 1) != 0; }
4126   bool C1() const                      { return ((_value >>  9) & 1) != 0; }
4127   bool C0() const                      { return ((_value >>  8) & 1) != 0; }
4128   int  top() const                     { return  (_value >> 11) & 7      ; }
4129   bool error_status() const            { return ((_value >>  7) & 1) != 0; }
4130   bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
4131   bool precision() const               { return ((_value >>  5) & 1) != 0; }
4132   bool underflow() const               { return ((_value >>  4) & 1) != 0; }
4133   bool overflow() const                { return ((_value >>  3) & 1) != 0; }
4134   bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
4135   bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
4136   bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
4137 
4138   void print() const {
4139     // condition codes
4140     char c[5];
4141     c[0] = (C3()) ? '3' : '-';
4142     c[1] = (C2()) ? '2' : '-';
4143     c[2] = (C1()) ? '1' : '-';
4144     c[3] = (C0()) ? '0' : '-';
4145     c[4] = '\x0';
4146     // flags
4147     char f[9];
4148     f[0] = (error_status()) ? 'E' : '-';
4149     f[1] = (stack_fault ()) ? 'S' : '-';
4150     f[2] = (precision   ()) ? 'P' : '-';
4151     f[3] = (underflow   ()) ? 'U' : '-';
4152     f[4] = (overflow    ()) ? 'O' : '-';
4153     f[5] = (zero_divide ()) ? 'Z' : '-';
4154     f[6] = (denormalized()) ? 'D' : '-';
4155     f[7] = (invalid     ()) ? 'I' : '-';
4156     f[8] = '\x0';
4157     // output
4158     printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
4159   }
4160 
4161 };
4162 
4163 class TagWord {
4164  public:
4165   int32_t _value;
4166 
4167   int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
4168 
4169   void print() const {
4170     printf("%04x", _value & 0xFFFF);
4171   }
4172 
4173 };
4174 
4175 class FPU_Register {
4176  public:
4177   int32_t _m0;
4178   int32_t _m1;
4179   int16_t _ex;
4180 
4181   bool is_indefinite() const           {
4182     return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
4183   }
4184 
4185   void print() const {
4186     char  sign = (_ex < 0) ? '-' : '+';
4187     const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
4188     printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
4189   };
4190 
4191 };
4192 
4193 class FPU_State {
4194  public:
4195   enum {
4196     register_size       = 10,
4197     number_of_registers =  8,
4198     register_mask       =  7
4199   };
4200 
4201   ControlWord  _control_word;
4202   StatusWord   _status_word;
4203   TagWord      _tag_word;
4204   int32_t      _error_offset;
4205   int32_t      _error_selector;
4206   int32_t      _data_offset;
4207   int32_t      _data_selector;
4208   int8_t       _register[register_size * number_of_registers];
4209 
4210   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
4211   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
4212 
4213   const char* tag_as_string(int tag) const {
4214     switch (tag) {
4215       case 0: return "valid";
4216       case 1: return "zero";
4217       case 2: return "special";
4218       case 3: return "empty";
4219     }
4220     ShouldNotReachHere();
4221     return NULL;
4222   }
4223 
4224   void print() const {
4225     // print computation registers
4226     { int t = _status_word.top();
4227       for (int i = 0; i < number_of_registers; i++) {
4228         int j = (i - t) & register_mask;
4229         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
4230         st(j)->print();
4231         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
4232       }
4233     }
4234     printf("\n");
4235     // print control registers
4236     printf("ctrl = "); _control_word.print(); printf("\n");
4237     printf("stat = "); _status_word .print(); printf("\n");
4238     printf("tags = "); _tag_word    .print(); printf("\n");
4239   }
4240 
4241 };
4242 
4243 class Flag_Register {
4244  public:
4245   int32_t _value;
4246 
4247   bool overflow() const                { return ((_value >> 11) & 1) != 0; }
4248   bool direction() const               { return ((_value >> 10) & 1) != 0; }
4249   bool sign() const                    { return ((_value >>  7) & 1) != 0; }
4250   bool zero() const                    { return ((_value >>  6) & 1) != 0; }
4251   bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
4252   bool parity() const                  { return ((_value >>  2) & 1) != 0; }
4253   bool carry() const                   { return ((_value >>  0) & 1) != 0; }
4254 
4255   void print() const {
4256     // flags
4257     char f[8];
4258     f[0] = (overflow       ()) ? 'O' : '-';
4259     f[1] = (direction      ()) ? 'D' : '-';
4260     f[2] = (sign           ()) ? 'S' : '-';
4261     f[3] = (zero           ()) ? 'Z' : '-';
4262     f[4] = (auxiliary_carry()) ? 'A' : '-';
4263     f[5] = (parity         ()) ? 'P' : '-';
4264     f[6] = (carry          ()) ? 'C' : '-';
4265     f[7] = '\x0';
4266     // output
4267     printf("%08x  flags = %s", _value, f);
4268   }
4269 
4270 };
4271 
4272 class IU_Register {
4273  public:
4274   int32_t _value;
4275 
4276   void print() const {
4277     printf("%08x  %11d", _value, _value);
4278   }
4279 
4280 };
4281 
4282 class IU_State {
4283  public:
4284   Flag_Register _eflags;
4285   IU_Register   _rdi;
4286   IU_Register   _rsi;
4287   IU_Register   _rbp;
4288   IU_Register   _rsp;
4289   IU_Register   _rbx;
4290   IU_Register   _rdx;
4291   IU_Register   _rcx;
4292   IU_Register   _rax;
4293 
4294   void print() const {
4295     // computation registers
4296     printf("rax,  = "); _rax.print(); printf("\n");
4297     printf("rbx,  = "); _rbx.print(); printf("\n");
4298     printf("rcx  = "); _rcx.print(); printf("\n");
4299     printf("rdx  = "); _rdx.print(); printf("\n");
4300     printf("rdi  = "); _rdi.print(); printf("\n");
4301     printf("rsi  = "); _rsi.print(); printf("\n");
4302     printf("rbp,  = "); _rbp.print(); printf("\n");
4303     printf("rsp  = "); _rsp.print(); printf("\n");
4304     printf("\n");
4305     // control registers
4306     printf("flgs = "); _eflags.print(); printf("\n");
4307   }
4308 };
4309 
4310 
4311 class CPU_State {
4312  public:
4313   FPU_State _fpu_state;
4314   IU_State  _iu_state;
4315 
4316   void print() const {
4317     printf("--------------------------------------------------\n");
4318     _iu_state .print();
4319     printf("\n");
4320     _fpu_state.print();
4321     printf("--------------------------------------------------\n");
4322   }
4323 
4324 };
4325 
4326 
4327 static void _print_CPU_state(CPU_State* state) {
4328   state->print();
4329 };
4330 
4331 
4332 void MacroAssembler::print_CPU_state() {
4333   push_CPU_state();
4334   push(rsp);                // pass CPU state
4335   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
4336   addptr(rsp, wordSize);       // discard argument
4337   pop_CPU_state();
4338 }
4339 
4340 
4341 #ifndef _LP64
4342 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
4343   static int counter = 0;
4344   FPU_State* fs = &state->_fpu_state;
4345   counter++;
4346   // For leaf calls, only verify that the top few elements remain empty.
4347   // We only need 1 empty at the top for C2 code.
4348   if( stack_depth < 0 ) {
4349     if( fs->tag_for_st(7) != 3 ) {
4350       printf("FPR7 not empty\n");
4351       state->print();
4352       assert(false, "error");
4353       return false;
4354     }
4355     return true;                // All other stack states do not matter
4356   }
4357 
4358   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
4359          "bad FPU control word");
4360 
4361   // compute stack depth
4362   int i = 0;
4363   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
4364   int d = i;
4365   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
4366   // verify findings
4367   if (i != FPU_State::number_of_registers) {
4368     // stack not contiguous
4369     printf("%s: stack not contiguous at ST%d\n", s, i);
4370     state->print();
4371     assert(false, "error");
4372     return false;
4373   }
4374   // check if computed stack depth corresponds to expected stack depth
4375   if (stack_depth < 0) {
4376     // expected stack depth is -stack_depth or less
4377     if (d > -stack_depth) {
4378       // too many elements on the stack
4379       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
4380       state->print();
4381       assert(false, "error");
4382       return false;
4383     }
4384   } else {
4385     // expected stack depth is stack_depth
4386     if (d != stack_depth) {
4387       // wrong stack depth
4388       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
4389       state->print();
4390       assert(false, "error");
4391       return false;
4392     }
4393   }
4394   // everything is cool
4395   return true;
4396 }
4397 
4398 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
4399   if (!VerifyFPU) return;
4400   push_CPU_state();
4401   push(rsp);                // pass CPU state
4402   ExternalAddress msg((address) s);
4403   // pass message string s
4404   pushptr(msg.addr());
4405   push(stack_depth);        // pass stack depth
4406   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
4407   addptr(rsp, 3 * wordSize);   // discard arguments
4408   // check for error
4409   { Label L;
4410     testl(rax, rax);
4411     jcc(Assembler::notZero, L);
4412     int3();                  // break if error condition
4413     bind(L);
4414   }
4415   pop_CPU_state();
4416 }
#endif // !_LP64
4418 
4419 void MacroAssembler::restore_cpu_control_state_after_jni() {
4420   // Either restore the MXCSR register after returning from the JNI Call
4421   // or verify that it wasn't changed (with -Xcheck:jni flag).
4422   if (VM_Version::supports_sse()) {
4423     if (RestoreMXCSROnJNICalls) {
4424       ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
4425     } else if (CheckJNICalls) {
4426       call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
4427     }
4428   }
4429   // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
4430   vzeroupper();

#ifdef COMPILER2
  // Reset k1 to 0xffff.
4434   if (PostLoopMultiversioning && VM_Version::supports_evex()) {
4435     push(rcx);
4436     movl(rcx, 0xffff);
4437     kmovwl(k1, rcx);
4438     pop(rcx);
4439   }
4440 #endif // COMPILER2
4441 
4442 #ifndef _LP64
4443   // Either restore the x87 floating pointer control word after returning
4444   // from the JNI call or verify that it wasn't changed.
4445   if (CheckJNICalls) {
4446     call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
4447   }
4448 #endif // _LP64
4449 }
4450 
4451 // ((OopHandle)result).resolve();
4452 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
4453   assert_different_registers(result, tmp);
4454 
4455   // Only 64 bit platforms support GCs that require a tmp register
4456   // Only IN_HEAP loads require a thread_tmp register
4457   // OopHandle::resolve is an indirection like jobject.
4458   access_load_at(T_OBJECT, IN_NATIVE,
4459                  result, Address(result, 0), tmp, /*tmp_thread*/noreg);
4460 }
4461 
4462 // ((WeakHandle)result).resolve();
4463 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
4464   assert_different_registers(rresult, rtmp);
4465   Label resolved;
4466 
4467   // A null weak handle resolves to null.
4468   cmpptr(rresult, 0);
4469   jcc(Assembler::equal, resolved);
4470 
4471   // Only 64 bit platforms support GCs that require a tmp register
4472   // Only IN_HEAP loads require a thread_tmp register
4473   // WeakHandle::resolve is an indirection like jweak.
4474   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
4475                  rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
4476   bind(resolved);
4477 }
4478 
4479 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
4480   // get mirror
4481   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
4482   load_method_holder(mirror, method);
4483   movptr(mirror, Address(mirror, mirror_offset));
4484   resolve_oop_handle(mirror, tmp);
4485 }
4486 
4487 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
4488   load_method_holder(rresult, rmethod);
4489   movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
4490 }
4491 
4492 void MacroAssembler::load_method_holder(Register holder, Register method) {
4493   movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
4494   movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
4495   movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
4496 }
4497 
4498 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
4499   assert_different_registers(src, tmp);
4500   assert_different_registers(dst, tmp);
4501 #ifdef _LP64
4502   if (UseCompressedClassPointers) {
4503     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4504     decode_klass_not_null(dst, tmp);
4505   } else
4506 #endif
4507     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
4508 }
4509 
4510 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
4511   load_klass(dst, src, tmp);
4512   movptr(dst, Address(dst, Klass::prototype_header_offset()));
4513 }
4514 
4515 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
4516   assert_different_registers(src, tmp);
4517   assert_different_registers(dst, tmp);
4518 #ifdef _LP64
4519   if (UseCompressedClassPointers) {
4520     encode_klass_not_null(src, tmp);
4521     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4522   } else
4523 #endif
4524     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
4525 }
4526 
4527 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
4528                                     Register tmp1, Register thread_tmp) {
4529   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4530   decorators = AccessInternal::decorator_fixup(decorators);
4531   bool as_raw = (decorators & AS_RAW) != 0;
4532   if (as_raw) {
4533     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4534   } else {
4535     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4536   }
4537 }
4538 
4539 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
4540                                      Register tmp1, Register tmp2) {
4541   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4542   decorators = AccessInternal::decorator_fixup(decorators);
4543   bool as_raw = (decorators & AS_RAW) != 0;
4544   if (as_raw) {
4545     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
4546   } else {
4547     bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
4548   }
4549 }
4550 
4551 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4552   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4553   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4554     decorators |= ACCESS_READ | ACCESS_WRITE;
4555   }
4556   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4557   bs->resolve(this, decorators, obj);
4558 }
4559 
4560 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4561                                    Register thread_tmp, DecoratorSet decorators) {
4562   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4563 }
4564 
4565 // Doesn't do verification; generates fixed-size code
4566 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4567                                             Register thread_tmp, DecoratorSet decorators) {
4568   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4569 }
4570 
4571 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4572                                     Register tmp2, DecoratorSet decorators) {
4573   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4574 }
4575 
4576 // Used for storing NULLs.
4577 void MacroAssembler::store_heap_oop_null(Address dst) {
4578   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4579 }
4580 
4581 #ifdef _LP64
4582 void MacroAssembler::store_klass_gap(Register dst, Register src) {
4583   if (UseCompressedClassPointers) {
4584     // Store to klass gap in destination
4585     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
4586   }
4587 }
4588 
4589 #ifdef ASSERT
4590 void MacroAssembler::verify_heapbase(const char* msg) {
4591   assert (UseCompressedOops, "should be compressed");
4592   assert (Universe::heap() != NULL, "java heap should be initialized");
4593   if (CheckCompressedOops) {
4594     Label ok;
4595     push(rscratch1); // cmpptr trashes rscratch1
4596     cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4597     jcc(Assembler::equal, ok);
4598     STOP(msg);
4599     bind(ok);
4600     pop(rscratch1);
4601   }
4602 }
4603 #endif
4604 
4605 // Algorithm must match oop.inline.hpp encode_heap_oop.
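     // Illustrative pseudocode for the encoding generated below (a sketch only;
     // the authoritative algorithm is encode_heap_oop in oop.inline.hpp, per the
     // note above):
     //   if (base == NULL)  narrow = oop >> shift;                              // zero-based heap
     //   else               narrow = (oop == NULL) ? 0 : (oop - base) >> shift;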
4606 void MacroAssembler::encode_heap_oop(Register r) {
4607 #ifdef ASSERT
4608   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
4609 #endif
4610   verify_oop_msg(r, "broken oop in encode_heap_oop");
4611   if (CompressedOops::base() == NULL) {
4612     if (CompressedOops::shift() != 0) {
4613       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4614       shrq(r, LogMinObjAlignmentInBytes);
4615     }
4616     return;
4617   }
4618   testq(r, r);
4619   cmovq(Assembler::equal, r, r12_heapbase);
4620   subq(r, r12_heapbase);
4621   shrq(r, LogMinObjAlignmentInBytes);
4622 }
4623 
4624 void MacroAssembler::encode_heap_oop_not_null(Register r) {
4625 #ifdef ASSERT
4626   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
4627   if (CheckCompressedOops) {
4628     Label ok;
4629     testq(r, r);
4630     jcc(Assembler::notEqual, ok);
4631     STOP("null oop passed to encode_heap_oop_not_null");
4632     bind(ok);
4633   }
4634 #endif
4635   verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
4636   if (CompressedOops::base() != NULL) {
4637     subq(r, r12_heapbase);
4638   }
4639   if (CompressedOops::shift() != 0) {
4640     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4641     shrq(r, LogMinObjAlignmentInBytes);
4642   }
4643 }
4644 
4645 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
4646 #ifdef ASSERT
4647   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
4648   if (CheckCompressedOops) {
4649     Label ok;
4650     testq(src, src);
4651     jcc(Assembler::notEqual, ok);
4652     STOP("null oop passed to encode_heap_oop_not_null2");
4653     bind(ok);
4654   }
4655 #endif
4656   verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
4657   if (dst != src) {
4658     movq(dst, src);
4659   }
4660   if (CompressedOops::base() != NULL) {
4661     subq(dst, r12_heapbase);
4662   }
4663   if (CompressedOops::shift() != 0) {
4664     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4665     shrq(dst, LogMinObjAlignmentInBytes);
4666   }
4667 }
4668 
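     // Illustrative pseudocode for the inverse mapping (sketch only):
     //   if (base == NULL)  oop = narrow << shift;
     //   else               oop = (narrow == 0) ? NULL : base + (narrow << shift);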
4669 void  MacroAssembler::decode_heap_oop(Register r) {
4670 #ifdef ASSERT
4671   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4672 #endif
4673   if (CompressedOops::base() == NULL) {
4674     if (CompressedOops::shift() != 0) {
4675       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4676       shlq(r, LogMinObjAlignmentInBytes);
4677     }
4678   } else {
4679     Label done;
4680     shlq(r, LogMinObjAlignmentInBytes);
4681     jccb(Assembler::equal, done);
4682     addq(r, r12_heapbase);
4683     bind(done);
4684   }
4685   verify_oop_msg(r, "broken oop in decode_heap_oop");
4686 }
4687 
4688 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
4689   // Note: it will change flags
4690   assert (UseCompressedOops, "should only be used for compressed headers");
4691   assert (Universe::heap() != NULL, "java heap should be initialized");
4692   // Cannot assert, unverified entry point counts instructions (see .ad file)
4693   // vtableStubs also counts instructions in pd_code_size_limit.
4694   // Also do not verify_oop as this is called by verify_oop.
4695   if (CompressedOops::shift() != 0) {
4696     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4697     shlq(r, LogMinObjAlignmentInBytes);
4698     if (CompressedOops::base() != NULL) {
4699       addq(r, r12_heapbase);
4700     }
4701   } else {
4702     assert (CompressedOops::base() == NULL, "sanity");
4703   }
4704 }
4705 
4706 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4707   // Note: it will change flags
4708   assert (UseCompressedOops, "should only be used for compressed headers");
4709   assert (Universe::heap() != NULL, "java heap should be initialized");
4710   // Cannot assert, unverified entry point counts instructions (see .ad file)
4711   // vtableStubs also counts instructions in pd_code_size_limit.
4712   // Also do not verify_oop as this is called by verify_oop.
4713   if (CompressedOops::shift() != 0) {
4714     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4715     if (LogMinObjAlignmentInBytes == Address::times_8) {
4716       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
4717     } else {
4718       if (dst != src) {
4719         movq(dst, src);
4720       }
4721       shlq(dst, LogMinObjAlignmentInBytes);
4722       if (CompressedOops::base() != NULL) {
4723         addq(dst, r12_heapbase);
4724       }
4725     }
4726   } else {
4727     assert (CompressedOops::base() == NULL, "sanity");
4728     if (dst != src) {
4729       movq(dst, src);
4730     }
4731   }
4732 }
4733 
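     // Klass pointers are compressed the same way, but against the klass-range
     // base and shift (sketch): narrow_klass = (klass - base) >> shift. Unlike
     // oops, NULL is never encoded here; callers guarantee a non-null Klass*.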
4734 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
4735   assert_different_registers(r, tmp);
4736   if (CompressedKlassPointers::base() != NULL) {
4737     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4738     subq(r, tmp);
4739   }
4740   if (CompressedKlassPointers::shift() != 0) {
4741     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4742     shrq(r, LogKlassAlignmentInBytes);
4743   }
4744 }
4745 
4746 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
4747   assert_different_registers(src, dst);
4748   if (CompressedKlassPointers::base() != NULL) {
4749     mov64(dst, -(int64_t)CompressedKlassPointers::base());
4750     addq(dst, src);
4751   } else {
4752     movptr(dst, src);
4753   }
4754   if (CompressedKlassPointers::shift() != 0) {
4755     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4756     shrq(dst, LogKlassAlignmentInBytes);
4757   }
4758 }
4759 
4760 // !!! If the instructions that get generated here change then function
4761 // instr_size_for_decode_klass_not_null() needs to get updated.
4762 void  MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
4763   assert_different_registers(r, tmp);
4764   // Note: it will change flags
4765   assert(UseCompressedClassPointers, "should only be used for compressed headers");
4766   // Cannot assert, unverified entry point counts instructions (see .ad file)
4767   // vtableStubs also counts instructions in pd_code_size_limit.
4768   // Also do not verify_oop as this is called by verify_oop.
4769   if (CompressedKlassPointers::shift() != 0) {
4770     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4771     shlq(r, LogKlassAlignmentInBytes);
4772   }
4773   if (CompressedKlassPointers::base() != NULL) {
4774     mov64(tmp, (int64_t)CompressedKlassPointers::base());
4775     addq(r, tmp);
4776   }
4777 }
4778 
4779 void  MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
4780   assert_different_registers(src, dst);
4781   // Note: it will change flags
4782   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4783   // Cannot assert, unverified entry point counts instructions (see .ad file)
4784   // vtableStubs also counts instructions in pd_code_size_limit.
4785   // Also do not verify_oop as this is called by verify_oop.
4786 
4787   if (CompressedKlassPointers::base() == NULL &&
4788       CompressedKlassPointers::shift() == 0) {
4789     // The best case scenario is that there is no base or shift. Then it is already
4790     // a pointer that needs nothing but a register rename.
4791     movl(dst, src);
4792   } else {
4793     if (CompressedKlassPointers::base() != NULL) {
4794       mov64(dst, (int64_t)CompressedKlassPointers::base());
4795     } else {
4796       xorq(dst, dst);
4797     }
4798     if (CompressedKlassPointers::shift() != 0) {
4799       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4800       assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
4801       leaq(dst, Address(dst, src, Address::times_8, 0));
4802     } else {
4803       addq(dst, src);
4804     }
4805   }
4806 }
4807 
4808 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4809   assert (UseCompressedOops, "should only be used for compressed headers");
4810   assert (Universe::heap() != NULL, "java heap should be initialized");
4811   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4812   int oop_index = oop_recorder()->find_index(obj);
4813   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4814   mov_narrow_oop(dst, oop_index, rspec);
4815 }
4816 
4817 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
4818   assert (UseCompressedOops, "should only be used for compressed headers");
4819   assert (Universe::heap() != NULL, "java heap should be initialized");
4820   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4821   int oop_index = oop_recorder()->find_index(obj);
4822   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4823   mov_narrow_oop(dst, oop_index, rspec);
4824 }
4825 
4826 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4827   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4828   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4829   int klass_index = oop_recorder()->find_index(k);
4830   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4831   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4832 }
4833 
4834 void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
4835   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4836   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4837   int klass_index = oop_recorder()->find_index(k);
4838   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4839   mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4840 }
4841 
4842 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
4843   assert (UseCompressedOops, "should only be used for compressed headers");
4844   assert (Universe::heap() != NULL, "java heap should be initialized");
4845   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4846   int oop_index = oop_recorder()->find_index(obj);
4847   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4848   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4849 }
4850 
4851 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
4852   assert (UseCompressedOops, "should only be used for compressed headers");
4853   assert (Universe::heap() != NULL, "java heap should be initialized");
4854   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4855   int oop_index = oop_recorder()->find_index(obj);
4856   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4857   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
4858 }
4859 
4860 void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
4861   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4862   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4863   int klass_index = oop_recorder()->find_index(k);
4864   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4865   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4866 }
4867 
4868 void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
4869   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4870   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4871   int klass_index = oop_recorder()->find_index(k);
4872   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
4873   Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
4874 }
4875 
4876 void MacroAssembler::reinit_heapbase() {
4877   if (UseCompressedOops) {
4878     if (Universe::heap() != NULL) {
4879       if (CompressedOops::base() == NULL) {
4880         MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
4881       } else {
4882         mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base());
4883       }
4884     } else {
4885       movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
4886     }
4887   }
4888 }
4889 
4890 #endif // _LP64
4891 
4892 // C2 compiled method's prolog code.
4893 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
4894 
4895   // WARNING: Initial instruction MUST be 5 bytes or longer so that
4896   // NativeJump::patch_verified_entry will be able to patch out the entry
4897   // code safely. The push to verify stack depth is ok at 5 bytes,
4898   // the frame allocation can be either 3 or 6 bytes. So if we don't do
4899   // stack bang then we must use the 6 byte frame allocation even if
4900   // we have no frame. :-(
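       // (A "jmp rel32" is opcode 0xE9 plus a 32-bit displacement, i.e. exactly
       // 5 bytes, which is why 5 bytes is the minimum above.)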
4901   assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
4902 
4903   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
4904   // Remove word for return addr
4905   framesize -= wordSize;
4906   stack_bang_size -= wordSize;
4907 
4908   // Calls to C2R adapters often do not accept exceptional returns.
4909   // We require that their callers must bang for them.  But be careful, because
4910   // some VM calls (such as call site linkage) can use several kilobytes of
4911   // stack.  But the stack safety zone should account for that.
4912   // See bugs 4446381, 4468289, 4497237.
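       // Frame shape produced by both paths below (sketch; one machine word per slot):
       //   [ return address ]  <- rsp + framesize + wordSize
       //   [ saved rbp      ]  <- rsp + framesize   (framesize after the adjustments below)
       //   [ spills/locals  ]
       //   [ outgoing args  ]  <- rsp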
4913   if (stack_bang_size > 0) {
4914     generate_stack_overflow_check(stack_bang_size);
4915 
4916     // We always push rbp, so that on return to interpreter rbp, will be
4917     // restored correctly and we can correct the stack.
4918     push(rbp);
4919     // Save caller's stack pointer into RBP if the frame pointer is preserved.
4920     if (PreserveFramePointer) {
4921       mov(rbp, rsp);
4922     }
4923     // Remove word for ebp
4924     framesize -= wordSize;
4925 
4926     // Create frame
4927     if (framesize) {
4928       subptr(rsp, framesize);
4929     }
4930   } else {
4931     // Create frame (force generation of a 4 byte immediate value)
4932     subptr_imm32(rsp, framesize);
4933 
4934     // Save RBP register now.
4935     framesize -= wordSize;
4936     movptr(Address(rsp, framesize), rbp);
4937     // Save caller's stack pointer into RBP if the frame pointer is preserved.
4938     if (PreserveFramePointer) {
4939       movptr(rbp, rsp);
4940       if (framesize > 0) {
4941         addptr(rbp, framesize);
4942       }
4943     }
4944   }
4945 
4946   if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
4947     framesize -= wordSize;
4948     movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
4949   }
4950 
4951 #ifndef _LP64
4952   // If method sets FPU control word do it now
4953   if (fp_mode_24b) {
4954     fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
4955   }
4956   if (UseSSE >= 2 && VerifyFPU) {
4957     verify_FPU(0, "FPU stack must be clean on entry");
4958   }
4959 #endif
4960 
4961 #ifdef ASSERT
4962   if (VerifyStackAtCalls) {
4963     Label L;
4964     push(rax);
4965     mov(rax, rsp);
4966     andptr(rax, StackAlignmentInBytes-1);
4967     cmpptr(rax, StackAlignmentInBytes-wordSize);
4968     pop(rax);
4969     jcc(Assembler::equal, L);
4970     STOP("Stack is not properly aligned!");
4971     bind(L);
4972   }
4973 #endif
4974 
4975   if (!is_stub) {
4976     BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4977     bs->nmethod_entry_barrier(this);
4978   }
4979 }
4980 
4981 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
4982 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
4983   // cnt - number of qwords (8-byte words).
4984   // base - start address, qword aligned.
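       // Equivalent C-style sketch of the control flow below (cnt in qwords):
       //   while (cnt >= 8) { zero 64 bytes; base += 64; cnt -= 8; }  // L_loop
       //   if    (cnt >= 4) { zero 32 bytes; base += 32; cnt -= 4; }  // trailing 32 bytes
       //   while (cnt >  0) { zero  8 bytes; base +=  8; cnt -= 1; }  // L_sloop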
4985   Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
4986   if (UseAVX >= 2) {
4987     vpxor(xtmp, xtmp, xtmp, AVX_256bit);
4988   } else {
4989     pxor(xtmp, xtmp);
4990   }
4991   jmp(L_zero_64_bytes);
4992 
4993   BIND(L_loop);
4994   if (UseAVX >= 2) {
4995     vmovdqu(Address(base,  0), xtmp);
4996     vmovdqu(Address(base, 32), xtmp);
4997   } else {
4998     movdqu(Address(base,  0), xtmp);
4999     movdqu(Address(base, 16), xtmp);
5000     movdqu(Address(base, 32), xtmp);
5001     movdqu(Address(base, 48), xtmp);
5002   }
5003   addptr(base, 64);
5004 
5005   BIND(L_zero_64_bytes);
5006   subptr(cnt, 8);
5007   jccb(Assembler::greaterEqual, L_loop);
5008   addptr(cnt, 4);
5009   jccb(Assembler::less, L_tail);
5010   // Copy trailing 32 bytes
5011   if (UseAVX >= 2) {
5012     vmovdqu(Address(base, 0), xtmp);
5013   } else {
5014     movdqu(Address(base,  0), xtmp);
5015     movdqu(Address(base, 16), xtmp);
5016   }
5017   addptr(base, 32);
5018   subptr(cnt, 4);
5019 
5020   BIND(L_tail);
5021   addptr(cnt, 4);
5022   jccb(Assembler::lessEqual, L_end);
5023   decrement(cnt);
5024 
5025   BIND(L_sloop);
5026   movq(Address(base, 0), xtmp);
5027   addptr(base, 8);
5028   decrement(cnt);
5029   jccb(Assembler::greaterEqual, L_sloop);
5030   BIND(L_end);
5031 }
5032 
5033 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
5034   // cnt - number of qwords (8-byte words).
5035   // base - start address, qword aligned.
5036   // is_large - whether the optimizer knows cnt is larger than InitArrayShortSize
5037   assert(base==rdi, "base register must be edi for rep stos");
5038   assert(tmp==rax,   "tmp register must be eax for rep stos");
5039   assert(cnt==rcx,   "cnt register must be ecx for rep stos");
5040   assert(InitArrayShortSize % BytesPerLong == 0,
5041     "InitArrayShortSize should be the multiple of BytesPerLong");
5042 
5043   Label DONE;
5044 
5045   if (!is_large || !UseXMMForObjInit) {
5046     xorptr(tmp, tmp);
5047   }
5048 
5049   if (!is_large) {
5050     Label LOOP, LONG;
5051     cmpptr(cnt, InitArrayShortSize/BytesPerLong);
5052     jccb(Assembler::greater, LONG);
5053 
5054     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5055 
5056     decrement(cnt);
5057     jccb(Assembler::negative, DONE); // Zero length
5058 
5059     // Use individual pointer-sized stores for small counts:
5060     BIND(LOOP);
5061     movptr(Address(base, cnt, Address::times_ptr), tmp);
5062     decrement(cnt);
5063     jccb(Assembler::greaterEqual, LOOP);
5064     jmpb(DONE);
5065 
5066     BIND(LONG);
5067   }
5068 
5069   // Use longer rep-prefixed ops for non-small counts:
5070   if (UseFastStosb) {
5071     shlptr(cnt, 3); // convert to number of bytes
5072     rep_stosb();
5073   } else if (UseXMMForObjInit) {
5074     movptr(tmp, base);
5075     xmm_clear_mem(tmp, cnt, xtmp);
5076   } else {
5077     NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
5078     rep_stos();
5079   }
5080 
5081   BIND(DONE);
5082 }
5083 
5084 void MacroAssembler::generate_fill(BasicType t, bool aligned,
5085                                    Register to, Register value, Register count,
5086                                    Register rtmp, XMMRegister xtmp) {
5087   ShortBranchVerifier sbv(this);
5088   assert_different_registers(to, value, count, rtmp);
5089   Label L_exit;
5090   Label L_fill_2_bytes, L_fill_4_bytes;
5091 
5092   int shift = -1;
5093   switch (t) {
5094     case T_BYTE:
5095       shift = 2;
5096       break;
5097     case T_SHORT:
5098       shift = 1;
5099       break;
5100     case T_INT:
5101       shift = 0;
5102       break;
5103     default: ShouldNotReachHere();
5104   }
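       // shift is chosen so that (1 << shift) elements occupy 4 bytes and
       // (2 << shift) elements occupy 8 bytes in the size checks below.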
5105 
5106   if (t == T_BYTE) {
5107     andl(value, 0xff);
5108     movl(rtmp, value);
5109     shll(rtmp, 8);
5110     orl(value, rtmp);
5111   }
5112   if (t == T_SHORT) {
5113     andl(value, 0xffff);
5114   }
5115   if (t == T_BYTE || t == T_SHORT) {
5116     movl(rtmp, value);
5117     shll(rtmp, 16);
5118     orl(value, rtmp);
5119   }
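       // For T_BYTE and T_SHORT the fill value is now replicated across all
       // 32 bits, e.g. 0xab becomes 0xabababab and 0xcdef becomes 0xcdefcdef.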
5120 
5121   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
5122   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
5123   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
5124     Label L_skip_align2;
5125     // align source address at 4 bytes address boundary
5126     if (t == T_BYTE) {
5127       Label L_skip_align1;
5128       // One byte misalignment happens only for byte arrays
5129       testptr(to, 1);
5130       jccb(Assembler::zero, L_skip_align1);
5131       movb(Address(to, 0), value);
5132       increment(to);
5133       decrement(count);
5134       BIND(L_skip_align1);
5135     }
5136     // Two bytes misalignment happens only for byte and short (char) arrays
5137     testptr(to, 2);
5138     jccb(Assembler::zero, L_skip_align2);
5139     movw(Address(to, 0), value);
5140     addptr(to, 2);
5141     subl(count, 1<<(shift-1));
5142     BIND(L_skip_align2);
5143   }
5144   if (UseSSE < 2) {
5145     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5146     // Fill 32-byte chunks
5147     subl(count, 8 << shift);
5148     jcc(Assembler::less, L_check_fill_8_bytes);
5149     align(16);
5150 
5151     BIND(L_fill_32_bytes_loop);
5152 
5153     for (int i = 0; i < 32; i += 4) {
5154       movl(Address(to, i), value);
5155     }
5156 
5157     addptr(to, 32);
5158     subl(count, 8 << shift);
5159     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5160     BIND(L_check_fill_8_bytes);
5161     addl(count, 8 << shift);
5162     jccb(Assembler::zero, L_exit);
5163     jmpb(L_fill_8_bytes);
5164 
5165     //
5166     // length is too short, just fill qwords
5167     //
5168     BIND(L_fill_8_bytes_loop);
5169     movl(Address(to, 0), value);
5170     movl(Address(to, 4), value);
5171     addptr(to, 8);
5172     BIND(L_fill_8_bytes);
5173     subl(count, 1 << (shift + 1));
5174     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5175     // fall through to fill 4 bytes
5176   } else {
5177     Label L_fill_32_bytes;
5178     if (!UseUnalignedLoadStores) {
5179       // align to 8 bytes, we know we are 4 byte aligned to start
5180       testptr(to, 4);
5181       jccb(Assembler::zero, L_fill_32_bytes);
5182       movl(Address(to, 0), value);
5183       addptr(to, 4);
5184       subl(count, 1<<shift);
5185     }
5186     BIND(L_fill_32_bytes);
5187     {
5188       assert( UseSSE >= 2, "supported cpu only" );
5189       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
5190       movdl(xtmp, value);
5191       if (UseAVX >= 2 && UseUnalignedLoadStores) {
5192         Label L_check_fill_32_bytes;
5193         if (UseAVX > 2) {
5194           // Fill 64-byte chunks
5195           Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
5196 
5197           // If number of bytes to fill < AVX3Threshold, perform fill using AVX2
5198           cmpl(count, AVX3Threshold);
5199           jccb(Assembler::below, L_check_fill_64_bytes_avx2);
5200 
5201           vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
5202 
5203           subl(count, 16 << shift);
5204           jccb(Assembler::less, L_check_fill_32_bytes);
5205           align(16);
5206 
5207           BIND(L_fill_64_bytes_loop_avx3);
5208           evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
5209           addptr(to, 64);
5210           subl(count, 16 << shift);
5211           jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3);
5212           jmpb(L_check_fill_32_bytes);
5213 
5214           BIND(L_check_fill_64_bytes_avx2);
5215         }
5216         // Fill 64-byte chunks
5217         Label L_fill_64_bytes_loop;
5218         vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit);
5219 
5220         subl(count, 16 << shift);
5221         jcc(Assembler::less, L_check_fill_32_bytes);
5222         align(16);
5223 
5224         BIND(L_fill_64_bytes_loop);
5225         vmovdqu(Address(to, 0), xtmp);
5226         vmovdqu(Address(to, 32), xtmp);
5227         addptr(to, 64);
5228         subl(count, 16 << shift);
5229         jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
5230 
5231         BIND(L_check_fill_32_bytes);
5232         addl(count, 8 << shift);
5233         jccb(Assembler::less, L_check_fill_8_bytes);
5234         vmovdqu(Address(to, 0), xtmp);
5235         addptr(to, 32);
5236         subl(count, 8 << shift);
5237 
5238         BIND(L_check_fill_8_bytes);
5239         // clean upper bits of YMM registers
5240         movdl(xtmp, value);
5241         pshufd(xtmp, xtmp, 0);
5242       } else {
5243         // Fill 32-byte chunks
5244         pshufd(xtmp, xtmp, 0);
5245 
5246         subl(count, 8 << shift);
5247         jcc(Assembler::less, L_check_fill_8_bytes);
5248         align(16);
5249 
5250         BIND(L_fill_32_bytes_loop);
5251 
5252         if (UseUnalignedLoadStores) {
5253           movdqu(Address(to, 0), xtmp);
5254           movdqu(Address(to, 16), xtmp);
5255         } else {
5256           movq(Address(to, 0), xtmp);
5257           movq(Address(to, 8), xtmp);
5258           movq(Address(to, 16), xtmp);
5259           movq(Address(to, 24), xtmp);
5260         }
5261 
5262         addptr(to, 32);
5263         subl(count, 8 << shift);
5264         jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
5265 
5266         BIND(L_check_fill_8_bytes);
5267       }
5268       addl(count, 8 << shift);
5269       jccb(Assembler::zero, L_exit);
5270       jmpb(L_fill_8_bytes);
5271 
5272       //
5273       // length is too short, just fill qwords
5274       //
5275       BIND(L_fill_8_bytes_loop);
5276       movq(Address(to, 0), xtmp);
5277       addptr(to, 8);
5278       BIND(L_fill_8_bytes);
5279       subl(count, 1 << (shift + 1));
5280       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
5281     }
5282   }
5283   // fill trailing 4 bytes
5284   BIND(L_fill_4_bytes);
5285   testl(count, 1<<shift);
5286   jccb(Assembler::zero, L_fill_2_bytes);
5287   movl(Address(to, 0), value);
5288   if (t == T_BYTE || t == T_SHORT) {
5289     Label L_fill_byte;
5290     addptr(to, 4);
5291     BIND(L_fill_2_bytes);
5292     // fill trailing 2 bytes
5293     testl(count, 1<<(shift-1));
5294     jccb(Assembler::zero, L_fill_byte);
5295     movw(Address(to, 0), value);
5296     if (t == T_BYTE) {
5297       addptr(to, 2);
5298       BIND(L_fill_byte);
5299       // fill trailing byte
5300       testl(count, 1);
5301       jccb(Assembler::zero, L_exit);
5302       movb(Address(to, 0), value);
5303     } else {
5304       BIND(L_fill_byte);
5305     }
5306   } else {
5307     BIND(L_fill_2_bytes);
5308   }
5309   BIND(L_exit);
5310 }
5311 
5312 // encode char[] to byte[] in ISO_8859_1
5313 //   @HotSpotIntrinsicCandidate
5314 //   private static int implEncodeISOArray(byte[] sa, int sp,
5315 //                                         byte[] da, int dp, int len) {
5316 //     int i = 0;
5317 //     for (; i < len; i++) {
5318 //       char c = StringUTF16.getChar(sa, sp++);
5319 //       if (c > '\u00FF')
5320 //         break;
5321 //       da[dp++] = (byte)c;
5322 //     }
5323 //     return i;
5324 //   }
5325 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
5326   XMMRegister tmp1Reg, XMMRegister tmp2Reg,
5327   XMMRegister tmp3Reg, XMMRegister tmp4Reg,
5328   Register tmp5, Register result) {
5329 
5330   // rsi: src
5331   // rdi: dst
5332   // rdx: len
5333   // rcx: tmp5
5334   // rax: result
5335   ShortBranchVerifier sbv(this);
5336   assert_different_registers(src, dst, len, tmp5, result);
5337   Label L_done, L_copy_1_char, L_copy_1_char_exit;
5338 
5339   // set result
5340   xorl(result, result);
5341   // check for zero length
5342   testl(len, len);
5343   jcc(Assembler::zero, L_done);
5344 
5345   movl(result, len);
5346 
5347   // Setup pointers
5348   lea(src, Address(src, len, Address::times_2)); // char[]
5349   lea(dst, Address(dst, len, Address::times_1)); // byte[]
5350   negptr(len);
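       // Vector strategy (sketch): len runs from -count up to 0, indexing both
       // arrays from their ends. Each pass tests a chunk of chars against the
       // 0xff00 per-char mask; any char > '\u00FF' aborts to the scalar loop,
       // which finds the exact index of the first such char.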
5351 
5352   if (UseSSE42Intrinsics || UseAVX >= 2) {
5353     Label L_copy_8_chars, L_copy_8_chars_exit;
5354     Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
5355 
5356     if (UseAVX >= 2) {
5357       Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
5358       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
5359       movdl(tmp1Reg, tmp5);
5360       vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
5361       jmp(L_chars_32_check);
5362 
5363       bind(L_copy_32_chars);
5364       vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
5365       vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
5366       vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5367       vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
5368       jccb(Assembler::notZero, L_copy_32_chars_exit);
5369       vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
5370       vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
5371       vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
5372 
5373       bind(L_chars_32_check);
5374       addptr(len, 32);
5375       jcc(Assembler::lessEqual, L_copy_32_chars);
5376 
5377       bind(L_copy_32_chars_exit);
5378       subptr(len, 16);
5379       jccb(Assembler::greater, L_copy_16_chars_exit);
5380 
5381     } else if (UseSSE42Intrinsics) {
5382       movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
5383       movdl(tmp1Reg, tmp5);
5384       pshufd(tmp1Reg, tmp1Reg, 0);
5385       jmpb(L_chars_16_check);
5386     }
5387 
5388     bind(L_copy_16_chars);
5389     if (UseAVX >= 2) {
5390       vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
5391       vptest(tmp2Reg, tmp1Reg);
5392       jcc(Assembler::notZero, L_copy_16_chars_exit);
5393       vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
5394       vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
5395     } else {
5396       if (UseAVX > 0) {
5397         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5398         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5399         vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
5400       } else {
5401         movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
5402         por(tmp2Reg, tmp3Reg);
5403         movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
5404         por(tmp2Reg, tmp4Reg);
5405       }
5406       ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
5407       jccb(Assembler::notZero, L_copy_16_chars_exit);
5408       packuswb(tmp3Reg, tmp4Reg);
5409     }
5410     movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
5411 
5412     bind(L_chars_16_check);
5413     addptr(len, 16);
5414     jcc(Assembler::lessEqual, L_copy_16_chars);
5415 
5416     bind(L_copy_16_chars_exit);
5417     if (UseAVX >= 2) {
5418       // clean upper bits of YMM registers
5419       vpxor(tmp2Reg, tmp2Reg);
5420       vpxor(tmp3Reg, tmp3Reg);
5421       vpxor(tmp4Reg, tmp4Reg);
5422       movdl(tmp1Reg, tmp5);
5423       pshufd(tmp1Reg, tmp1Reg, 0);
5424     }
5425     subptr(len, 8);
5426     jccb(Assembler::greater, L_copy_8_chars_exit);
5427 
5428     bind(L_copy_8_chars);
5429     movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
5430     ptest(tmp3Reg, tmp1Reg);
5431     jccb(Assembler::notZero, L_copy_8_chars_exit);
5432     packuswb(tmp3Reg, tmp1Reg);
5433     movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
5434     addptr(len, 8);
5435     jccb(Assembler::lessEqual, L_copy_8_chars);
5436 
5437     bind(L_copy_8_chars_exit);
5438     subptr(len, 8);
5439     jccb(Assembler::zero, L_done);
5440   }
5441 
5442   bind(L_copy_1_char);
5443   load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
5444   testl(tmp5, 0xff00);      // check if Unicode char
5445   jccb(Assembler::notZero, L_copy_1_char_exit);
5446   movb(Address(dst, len, Address::times_1, 0), tmp5);
5447   addptr(len, 1);
5448   jccb(Assembler::less, L_copy_1_char);
5449 
5450   bind(L_copy_1_char_exit);
5451   addptr(result, len); // len is negative count of not processed elements
5452 
5453   bind(L_done);
5454 }
5455 
5456 #ifdef _LP64
5457 /**
5458  * Helper for multiply_to_len(): 128-bit accumulate, dest_hi:dest_lo += src1 + src2.
5459  */
5460 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
5461   addq(dest_lo, src1);
5462   adcq(dest_hi, 0);
5463   addq(dest_lo, src2);
5464   adcq(dest_hi, 0);
5465 }
5466 
5467 /**
5468  * Multiply 64 bit by 64 bit first loop.
5469  */
5470 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5471                                            Register y, Register y_idx, Register z,
5472                                            Register carry, Register product,
5473                                            Register idx, Register kdx) {
5474   //
5475   //  jlong carry, x[], y[], z[];
5476   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5477   //    huge_128 product = y[idx] * x[xstart] + carry;
5478   //    z[kdx] = (jlong)product;
5479   //    carry  = (jlong)(product >>> 64);
5480   //  }
5481   //  z[xstart] = carry;
5482   //
5483 
5484   Label L_first_loop, L_first_loop_exit;
5485   Label L_one_x, L_one_y, L_multiply;
5486 
5487   decrementl(xstart);
5488   jcc(Assembler::negative, L_one_x);
5489 
5490   movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5491   rorq(x_xstart, 32); // convert big-endian to little-endian
5492 
5493   bind(L_first_loop);
5494   decrementl(idx);
5495   jcc(Assembler::negative, L_first_loop_exit);
5496   decrementl(idx);
5497   jcc(Assembler::negative, L_one_y);
5498   movq(y_idx, Address(y, idx, Address::times_4,  0));
5499   rorq(y_idx, 32); // convert big-endian to little-endian
5500   bind(L_multiply);
5501   movq(product, x_xstart);
5502   mulq(y_idx); // product(rax) * y_idx -> rdx:rax
5503   addq(product, carry);
5504   adcq(rdx, 0);
5505   subl(kdx, 2);
5506   movl(Address(z, kdx, Address::times_4,  4), product);
5507   shrq(product, 32);
5508   movl(Address(z, kdx, Address::times_4,  0), product);
5509   movq(carry, rdx);
5510   jmp(L_first_loop);
5511 
5512   bind(L_one_y);
5513   movl(y_idx, Address(y,  0));
5514   jmp(L_multiply);
5515 
5516   bind(L_one_x);
5517   movl(x_xstart, Address(x,  0));
5518   jmp(L_first_loop);
5519 
5520   bind(L_first_loop_exit);
5521 }
5522 
5523 /**
5524  * Multiply 64 bit by 64 bit and add 128 bit.
5525  */
5526 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
5527                                             Register yz_idx, Register idx,
5528                                             Register carry, Register product, int offset) {
5529   //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
5530   //     z[kdx] = (jlong)product;
5531 
5532   movq(yz_idx, Address(y, idx, Address::times_4,  offset));
5533   rorq(yz_idx, 32); // convert big-endian to little-endian
5534   movq(product, x_xstart);
5535   mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
5536   movq(yz_idx, Address(z, idx, Address::times_4,  offset));
5537   rorq(yz_idx, 32); // convert big-endian to little-endian
5538 
5539   add2_with_carry(rdx, product, carry, yz_idx);
5540 
5541   movl(Address(z, idx, Address::times_4,  offset+4), product);
5542   shrq(product, 32);
5543   movl(Address(z, idx, Address::times_4,  offset), product);
5545 }
5546 
5547 /**
5548  * Multiply 128 bit by 128 bit. Unrolled inner loop.
5549  */
5550 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
5551                                              Register yz_idx, Register idx, Register jdx,
5552                                              Register carry, Register product,
5553                                              Register carry2) {
5554   //   jlong carry, x[], y[], z[];
5555   //   int kdx = ystart+1;
5556   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5557   //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
5558   //     z[kdx+idx+1] = (jlong)product;
5559   //     jlong carry2  = (jlong)(product >>> 64);
5560   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
5561   //     z[kdx+idx] = (jlong)product;
5562   //     carry  = (jlong)(product >>> 64);
5563   //   }
5564   //   idx += 2;
5565   //   if (idx > 0) {
5566   //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
5567   //     z[kdx+idx] = (jlong)product;
5568   //     carry  = (jlong)(product >>> 64);
5569   //   }
5570   //
5571 
5572   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5573 
5574   movl(jdx, idx);
5575   andl(jdx, 0xFFFFFFFC);
5576   shrl(jdx, 2);
5577 
5578   bind(L_third_loop);
5579   subl(jdx, 1);
5580   jcc(Assembler::negative, L_third_loop_exit);
5581   subl(idx, 4);
5582 
5583   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
5584   movq(carry2, rdx);
5585 
5586   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
5587   movq(carry, rdx);
5588   jmp(L_third_loop);
5589 
5590   bind (L_third_loop_exit);
5591 
5592   andl (idx, 0x3);
5593   jcc(Assembler::zero, L_post_third_loop_done);
5594 
5595   Label L_check_1;
5596   subl(idx, 2);
5597   jcc(Assembler::negative, L_check_1);
5598 
5599   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
5600   movq(carry, rdx);
5601 
5602   bind (L_check_1);
5603   addl (idx, 0x2);
5604   andl (idx, 0x1);
5605   subl(idx, 1);
5606   jcc(Assembler::negative, L_post_third_loop_done);
5607 
5608   movl(yz_idx, Address(y, idx, Address::times_4,  0));
5609   movq(product, x_xstart);
5610   mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
5611   movl(yz_idx, Address(z, idx, Address::times_4,  0));
5612 
5613   add2_with_carry(rdx, product, yz_idx, carry);
5614 
5615   movl(Address(z, idx, Address::times_4,  0), product);
5616   shrq(product, 32);
5617 
5618   shlq(rdx, 32);
5619   orq(product, rdx);
5620   movq(carry, product);
5621 
5622   bind(L_post_third_loop_done);
5623 }
5624 
5625 /**
5626  * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
5627  *
5628  */
5629 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
5630                                                   Register carry, Register carry2,
5631                                                   Register idx, Register jdx,
5632                                                   Register yz_idx1, Register yz_idx2,
5633                                                   Register tmp, Register tmp3, Register tmp4) {
5634   assert(UseBMI2Instructions, "should be used only when BMI2 is available");
5635 
5636   //   jlong carry, x[], y[], z[];
5637   //   int kdx = ystart+1;
5638   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5639   //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
5640   //     jlong carry2  = (jlong)(tmp3 >>> 64);
5641   //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
5642   //     carry  = (jlong)(tmp4 >>> 64);
5643   //     z[kdx+idx+1] = (jlong)tmp3;
5644   //     z[kdx+idx] = (jlong)tmp4;
5645   //   }
5646   //   idx += 2;
5647   //   if (idx > 0) {
5648   //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
5649   //     z[kdx+idx] = (jlong)yz_idx1;
5650   //     carry  = (jlong)(yz_idx1 >>> 64);
5651   //   }
5652   //
5653 
5654   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5655 
5656   movl(jdx, idx);
5657   andl(jdx, 0xFFFFFFFC);
5658   shrl(jdx, 2);
5659 
5660   bind(L_third_loop);
5661   subl(jdx, 1);
5662   jcc(Assembler::negative, L_third_loop_exit);
5663   subl(idx, 4);
5664 
5665   movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
5666   rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5667   movq(yz_idx2, Address(y, idx, Address::times_4,  0));
5668   rorxq(yz_idx2, yz_idx2, 32);
5669 
5670   mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
5671   mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
5672 
5673   movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
5674   rorxq(yz_idx1, yz_idx1, 32);
5675   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
5676   rorxq(yz_idx2, yz_idx2, 32);
5677 
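       // ADX note: adcx consumes and produces only CF while adox uses only OF,
       // so the two carry chains below can interleave without saving flags.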
5678   if (VM_Version::supports_adx()) {
5679     adcxq(tmp3, carry);
5680     adoxq(tmp3, yz_idx1);
5681 
5682     adcxq(tmp4, tmp);
5683     adoxq(tmp4, yz_idx2);
5684 
5685     movl(carry, 0); // does not affect flags
5686     adcxq(carry2, carry);
5687     adoxq(carry2, carry);
5688   } else {
5689     add2_with_carry(tmp4, tmp3, carry, yz_idx1);
5690     add2_with_carry(carry2, tmp4, tmp, yz_idx2);
5691   }
5692   movq(carry, carry2);
5693 
5694   movl(Address(z, idx, Address::times_4, 12), tmp3);
5695   shrq(tmp3, 32);
5696   movl(Address(z, idx, Address::times_4,  8), tmp3);
5697 
5698   movl(Address(z, idx, Address::times_4,  4), tmp4);
5699   shrq(tmp4, 32);
5700   movl(Address(z, idx, Address::times_4,  0), tmp4);
5701 
5702   jmp(L_third_loop);
5703 
5704   bind (L_third_loop_exit);
5705 
5706   andl (idx, 0x3);
5707   jcc(Assembler::zero, L_post_third_loop_done);
5708 
5709   Label L_check_1;
5710   subl(idx, 2);
5711   jcc(Assembler::negative, L_check_1);
5712 
5713   movq(yz_idx1, Address(y, idx, Address::times_4,  0));
5714   rorxq(yz_idx1, yz_idx1, 32);
5715   mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
5716   movq(yz_idx2, Address(z, idx, Address::times_4,  0));
5717   rorxq(yz_idx2, yz_idx2, 32);
5718 
5719   add2_with_carry(tmp4, tmp3, carry, yz_idx2);
5720 
5721   movl(Address(z, idx, Address::times_4,  4), tmp3);
5722   shrq(tmp3, 32);
5723   movl(Address(z, idx, Address::times_4,  0), tmp3);
5724   movq(carry, tmp4);
5725 
5726   bind (L_check_1);
5727   addl (idx, 0x2);
5728   andl (idx, 0x1);
5729   subl(idx, 1);
5730   jcc(Assembler::negative, L_post_third_loop_done);
5731   movl(tmp4, Address(y, idx, Address::times_4,  0));
5732   mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
5733   movl(tmp4, Address(z, idx, Address::times_4,  0));
5734 
5735   add2_with_carry(carry2, tmp3, tmp4, carry);
5736 
5737   movl(Address(z, idx, Address::times_4,  0), tmp3);
5738   shrq(tmp3, 32);
5739 
5740   shlq(carry2, 32);
5741   orq(tmp3, carry2);
5742   movq(carry, tmp3);
5743 
5744   bind(L_post_third_loop_done);
5745 }
5746 
5747 /**
5748  * Code for BigInteger::multiplyToLen() intrinsic.
5749  *
5750  * rdi: x
5751  * rax: xlen
5752  * rsi: y
5753  * rcx: ylen
5754  * r8:  z
5755  * r11: zlen
5756  * r12: tmp1
5757  * r13: tmp2
5758  * r14: tmp3
5759  * r15: tmp4
5760  * rbx: tmp5
5761  *
5762  */
5763 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
5764                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
5765   ShortBranchVerifier sbv(this);
5766   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
5767 
5768   push(tmp1);
5769   push(tmp2);
5770   push(tmp3);
5771   push(tmp4);
5772   push(tmp5);
5773 
5774   push(xlen);
5775   push(zlen);
5776 
5777   const Register idx = tmp1;
5778   const Register kdx = tmp2;
5779   const Register xstart = tmp3;
5780 
5781   const Register y_idx = tmp4;
5782   const Register carry = tmp5;
5783   const Register product  = xlen;
5784   const Register x_xstart = zlen;  // reuse register
5785 
5786   // First Loop.
5787   //
5788   //  final static long LONG_MASK = 0xffffffffL;
5789   //  int xstart = xlen - 1;
5790   //  int ystart = ylen - 1;
5791   //  long carry = 0;
5792   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5793   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
5794   //    z[kdx] = (int)product;
5795   //    carry = product >>> 32;
5796   //  }
5797   //  z[xstart] = (int)carry;
5798   //
5799 
5800   movl(idx, ylen);      // idx = ylen;
5801   movl(kdx, zlen);      // kdx = xlen+ylen;
5802   xorq(carry, carry);   // carry = 0;
5803 
5804   Label L_done;
5805 
5806   movl(xstart, xlen);
5807   decrementl(xstart);
5808   jcc(Assembler::negative, L_done);
5809 
5810   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5811 
5812   Label L_second_loop;
5813   testl(kdx, kdx);
5814   jcc(Assembler::zero, L_second_loop);
5815 
5816   Label L_carry;
5817   subl(kdx, 1);
5818   jcc(Assembler::zero, L_carry);
5819 
5820   movl(Address(z, kdx, Address::times_4,  0), carry);
5821   shrq(carry, 32);
5822   subl(kdx, 1);
5823 
5824   bind(L_carry);
5825   movl(Address(z, kdx, Address::times_4,  0), carry);
5826 
5827   // Second and third (nested) loops.
5828   //
5829   // for (int i = xstart-1; i >= 0; i--) { // Second loop
5830   //   carry = 0;
5831   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5832   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5833   //                    (z[k] & LONG_MASK) + carry;
5834   //     z[k] = (int)product;
5835   //     carry = product >>> 32;
5836   //   }
5837   //   z[i] = (int)carry;
5838   // }
5839   //
5840   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
5841 
5842   const Register jdx = tmp1;
5843 
5844   bind(L_second_loop);
5845   xorl(carry, carry);    // carry = 0;
5846   movl(jdx, ylen);       // j = ystart+1
5847 
5848   subl(xstart, 1);       // i = xstart-1;
5849   jcc(Assembler::negative, L_done);
5850 
5851   push (z);
5852 
5853   Label L_last_x;
5854   lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
5855   subl(xstart, 1);       // i = xstart-1;
5856   jcc(Assembler::negative, L_last_x);
5857 
5858   if (UseBMI2Instructions) {
5859     movq(rdx,  Address(x, xstart, Address::times_4,  0));
5860     rorxq(rdx, rdx, 32); // convert big-endian to little-endian
5861   } else {
5862     movq(x_xstart, Address(x, xstart, Address::times_4,  0));
5863     rorq(x_xstart, 32);  // convert big-endian to little-endian
5864   }
5865 
5866   Label L_third_loop_prologue;
5867   bind(L_third_loop_prologue);
5868 
5869   push (x);
5870   push (xstart);
5871   push (ylen);
5872 
5874   if (UseBMI2Instructions) {
5875     multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
5876   } else { // !UseBMI2Instructions
5877     multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
5878   }
5879 
5880   pop(ylen);
5881   pop(xlen);
5882   pop(x);
5883   pop(z);
5884 
5885   movl(tmp3, xlen);
5886   addl(tmp3, 1);
5887   movl(Address(z, tmp3, Address::times_4,  0), carry);
5888   subl(tmp3, 1);
5889   jccb(Assembler::negative, L_done);
5890 
5891   shrq(carry, 32);
5892   movl(Address(z, tmp3, Address::times_4,  0), carry);
5893   jmp(L_second_loop);
5894 
5895   // Infrequently executed code is moved out of the loops.
5896   bind(L_last_x);
5897   if (UseBMI2Instructions) {
5898     movl(rdx, Address(x,  0));
5899   } else {
5900     movl(x_xstart, Address(x,  0));
5901   }
5902   jmp(L_third_loop_prologue);
5903 
5904   bind(L_done);
5905 
5906   pop(zlen);
5907   pop(xlen);
5908 
5909   pop(tmp5);
5910   pop(tmp4);
5911   pop(tmp3);
5912   pop(tmp2);
5913   pop(tmp1);
5914 }
5915 
5916 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
5917   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
5918   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
5919   Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
5920   Label VECTOR8_TAIL, VECTOR4_TAIL;
5921   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
5922   Label SAME_TILL_END, DONE;
5923   Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
5924 
5925   // scale is in rcx on both Win64 and Unix calling conventions
5926   ShortBranchVerifier sbv(this);
5927 
5928   shlq(length);
5929   xorq(result, result);
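       // length was scaled to a byte count above (shlq by cl); result accumulates
       // the byte offset of the first mismatch and is scaled back to an element
       // index (shrq by the same cl) on the mismatch exits below.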
5930 
5931   if ((AVX3Threshold == 0) && (UseAVX > 2) &&
5932       VM_Version::supports_avx512vlbw()) {
5933     Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
5934 
5935     cmpq(length, 64);
5936     jcc(Assembler::less, VECTOR32_TAIL);
5937 
5938     movq(tmp1, length);
5939     andq(tmp1, 0x3F);      // tail count
5940     andq(length, ~(0x3F)); //vector count
5941 
5942     bind(VECTOR64_LOOP);
5943     // AVX512 code to compare 64 byte vectors.
5944     evmovdqub(rymm0, Address(obja, result), false, Assembler::AVX_512bit);
5945     evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
5946     kortestql(k7, k7);
5947     jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
5948     addq(result, 64);
5949     subq(length, 64);
5950     jccb(Assembler::notZero, VECTOR64_LOOP);
5951 
5952     //bind(VECTOR64_TAIL);
5953     testq(tmp1, tmp1);
5954     jcc(Assembler::zero, SAME_TILL_END);
5955 
5957     // AVX512 code to compare up to 63 remaining bytes.
5958     mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
5959     shlxq(tmp2, tmp2, tmp1);
5960     notq(tmp2);
5961     kmovql(k3, tmp2);
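         // k3 now covers the tail: mask = (1 << tail_count) - 1, i.e. the low
         // tmp1 bits set (sketch).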
5962 
5963     evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit);
5964     evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);
5965 
5966     ktestql(k7, k3);
5967     jcc(Assembler::below, SAME_TILL_END);     // not mismatch
5968 
5969     bind(VECTOR64_NOT_EQUAL);
5970     kmovql(tmp1, k7);
5971     notq(tmp1);
5972     tzcntq(tmp1, tmp1);
5973     addq(result, tmp1);
5974     shrq(result);
5975     jmp(DONE);
5976     bind(VECTOR32_TAIL);
5977   }
5978 
5979   cmpq(length, 8);
5980   jcc(Assembler::equal, VECTOR8_LOOP);
5981   jcc(Assembler::less, VECTOR4_TAIL);
5982 
5983   if (UseAVX >= 2) {
5984     Label VECTOR16_TAIL, VECTOR32_LOOP;
5985 
5986     cmpq(length, 16);
5987     jcc(Assembler::equal, VECTOR16_LOOP);
5988     jcc(Assembler::less, VECTOR8_LOOP);
5989 
5990     cmpq(length, 32);
5991     jccb(Assembler::less, VECTOR16_TAIL);
5992 
5993     subq(length, 32);
5994     bind(VECTOR32_LOOP);
5995     vmovdqu(rymm0, Address(obja, result));
5996     vmovdqu(rymm1, Address(objb, result));
5997     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
5998     vptest(rymm2, rymm2);
5999     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
6000     addq(result, 32);
6001     subq(length, 32);
6002     jcc(Assembler::greaterEqual, VECTOR32_LOOP);
6003     addq(length, 32);
6004     jcc(Assembler::equal, SAME_TILL_END);
6005     //falling through if less than 32 bytes left (close the branch here)
6006 
6007     bind(VECTOR16_TAIL);
6008     cmpq(length, 16);
6009     jccb(Assembler::less, VECTOR8_TAIL);
6010     bind(VECTOR16_LOOP);
6011     movdqu(rymm0, Address(obja, result));
6012     movdqu(rymm1, Address(objb, result));
6013     vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
6014     ptest(rymm2, rymm2);
6015     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6016     addq(result, 16);
6017     subq(length, 16);
6018     jcc(Assembler::equal, SAME_TILL_END);
6019     //falling through if less than 16 bytes left
6020   } else {//regular intrinsics
6021 
6022     cmpq(length, 16);
6023     jccb(Assembler::less, VECTOR8_TAIL);
6024 
6025     subq(length, 16);
6026     bind(VECTOR16_LOOP);
6027     movdqu(rymm0, Address(obja, result));
6028     movdqu(rymm1, Address(objb, result));
6029     pxor(rymm0, rymm1);
6030     ptest(rymm0, rymm0);
6031     jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
6032     addq(result, 16);
6033     subq(length, 16);
6034     jccb(Assembler::greaterEqual, VECTOR16_LOOP);
6035     addq(length, 16);
6036     jcc(Assembler::equal, SAME_TILL_END);
6037     //falling through if less than 16 bytes left
6038   }
6039 
6040   bind(VECTOR8_TAIL);
6041   cmpq(length, 8);
6042   jccb(Assembler::less, VECTOR4_TAIL);
6043   bind(VECTOR8_LOOP);
6044   movq(tmp1, Address(obja, result));
6045   movq(tmp2, Address(objb, result));
6046   xorq(tmp1, tmp2);
6047   testq(tmp1, tmp1);
6048   jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
6049   addq(result, 8);
6050   subq(length, 8);
6051   jcc(Assembler::equal, SAME_TILL_END);
6052   //falling through if less than 8 bytes left
6053 
6054   bind(VECTOR4_TAIL);
6055   cmpq(length, 4);
6056   jccb(Assembler::less, BYTES_TAIL);
6057   bind(VECTOR4_LOOP);
6058   movl(tmp1, Address(obja, result));
6059   xorl(tmp1, Address(objb, result));
6060   testl(tmp1, tmp1);
6061   jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
6062   addq(result, 4);
6063   subq(length, 4);
6064   jcc(Assembler::equal, SAME_TILL_END);
6065   //falling through if less than 4 bytes left
6066 
6067   bind(BYTES_TAIL);
6068   bind(BYTES_LOOP);
6069   load_unsigned_byte(tmp1, Address(obja, result));
6070   load_unsigned_byte(tmp2, Address(objb, result));
6071   xorl(tmp1, tmp2);
6072   testl(tmp1, tmp1);
6073   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6074   decq(length);
6075   jcc(Assembler::zero, SAME_TILL_END);
6076   incq(result);
6077   load_unsigned_byte(tmp1, Address(obja, result));
6078   load_unsigned_byte(tmp2, Address(objb, result));
6079   xorl(tmp1, tmp2);
6080   testl(tmp1, tmp1);
6081   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6082   decq(length);
6083   jcc(Assembler::zero, SAME_TILL_END);
6084   incq(result);
6085   load_unsigned_byte(tmp1, Address(obja, result));
6086   load_unsigned_byte(tmp2, Address(objb, result));
6087   xorl(tmp1, tmp2);
6088   testl(tmp1, tmp1);
6089   jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
6090   jmp(SAME_TILL_END);
6091 
6092   if (UseAVX >= 2) {
6093     bind(VECTOR32_NOT_EQUAL);
6094     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
6095     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
6096     vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
6097     vpmovmskb(tmp1, rymm0);
6098     bsfq(tmp1, tmp1);
6099     addq(result, tmp1);
6100     shrq(result);
6101     jmp(DONE);
6102   }
6103 
6104   bind(VECTOR16_NOT_EQUAL);
6105   if (UseAVX >= 2) {
6106     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
6107     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
6108     pxor(rymm0, rymm2);
6109   } else {
6110     pcmpeqb(rymm2, rymm2);
6111     pxor(rymm0, rymm1);
6112     pcmpeqb(rymm0, rymm1);
6113     pxor(rymm0, rymm2);
6114   }
6115   pmovmskb(tmp1, rymm0);
6116   bsfq(tmp1, tmp1);
6117   addq(result, tmp1);
6118   shrq(result);
6119   jmpb(DONE);
6120 
6121   bind(VECTOR8_NOT_EQUAL);
6122   bind(VECTOR4_NOT_EQUAL);
6123   bsfq(tmp1, tmp1);
6124   shrq(tmp1, 3);
6125   addq(result, tmp1);
6126   bind(BYTES_NOT_EQUAL);
6127   shrq(result);
6128   jmpb(DONE);
6129 
6130   bind(SAME_TILL_END);
6131   mov64(result, -1);
6132 
6133   bind(DONE);
6134 }
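
// Reference sketch (not compiled): a plain-C++ model of the mismatch
// semantics generated above, assuming the element-size shift (log2 scale)
// is what the trailing shrq(result) applies. The name and signature are
// illustrative only.
//
//   static int64_t vectorized_mismatch_ref(const uint8_t* a, const uint8_t* b,
//                                          size_t length_in_bytes, int log2_scale) {
//     for (size_t i = 0; i < length_in_bytes; i++) {
//       if (a[i] != b[i]) {
//         return (int64_t)(i >> log2_scale); // index of the first mismatching element
//       }
//     }
//     return -1; // SAME_TILL_END: no mismatch in the compared range
//   }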
6135 
// Helper functions for square_to_len()
6137 
6138 /**
6139  * Store the squares of x[], right shifted one bit (divided by 2) into z[]
6140  * Preserves x and z and modifies rest of the registers.
6141  */
6142 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6143   // Perform square and right shift by 1
6144   // Handle odd xlen case first, then for even xlen do the following
6145   // jlong carry = 0;
6146   // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
6147   //     huge_128 product = x[j:j+1] * x[j:j+1];
6148   //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
6149   //     z[i+2:i+3] = (jlong)(product >>> 1);
6150   //     carry = (jlong)product;
6151   // }
6152 
6153   xorq(tmp5, tmp5);     // carry
6154   xorq(rdxReg, rdxReg);
6155   xorl(tmp1, tmp1);     // index for x
6156   xorl(tmp4, tmp4);     // index for z
6157 
6158   Label L_first_loop, L_first_loop_exit;
6159 
6160   testl(xlen, 1);
6161   jccb(Assembler::zero, L_first_loop); //jump if xlen is even
6162 
6163   // Square and right shift by 1 the odd element using 32 bit multiply
6164   movl(raxReg, Address(x, tmp1, Address::times_4, 0));
6165   imulq(raxReg, raxReg);
6166   shrq(raxReg, 1);
6167   adcq(tmp5, 0);
6168   movq(Address(z, tmp4, Address::times_4, 0), raxReg);
6169   incrementl(tmp1);
6170   addl(tmp4, 2);
6171 
6172   // Square and  right shift by 1 the rest using 64 bit multiply
6173   bind(L_first_loop);
6174   cmpptr(tmp1, xlen);
6175   jccb(Assembler::equal, L_first_loop_exit);
6176 
6177   // Square
6178   movq(raxReg, Address(x, tmp1, Address::times_4,  0));
6179   rorq(raxReg, 32);    // convert big-endian to little-endian
6180   mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
6181 
6182   // Right shift by 1 and save carry
6183   shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
6184   rcrq(rdxReg, 1);
6185   rcrq(raxReg, 1);
6186   adcq(tmp5, 0);
6187 
6188   // Store result in z
6189   movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
6190   movq(Address(z, tmp4, Address::times_4, 8), raxReg);
6191 
6192   // Update indices for x and z
6193   addl(tmp1, 2);
6194   addl(tmp4, 4);
6195   jmp(L_first_loop);
6196 
6197   bind(L_first_loop_exit);
6198 }
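
// Reference sketch (not compiled): a plain-C++ model of the loop above,
// treating x and z as arrays of 64-bit words and assuming unsigned __int128
// support. Names are illustrative only.
//
//   static void square_rshift_ref(const uint64_t* x, int xlen_words, uint64_t* z) {
//     uint64_t carry = 0; // only bit 0 is consumed by the next iteration
//     for (int j = 0, i = 0; j < xlen_words; j++, i += 2) {
//       unsigned __int128 product = (unsigned __int128)x[j] * x[j];
//       z[i]     = (carry << 63) | (uint64_t)(product >> 65);
//       z[i + 1] = (uint64_t)(product >> 1);
//       carry    = (uint64_t)product; // low bit of the square
//     }
//   }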
6199 
6200 
6201 /**
6202  * Perform the following multiply add operation using BMI2 instructions
6203  * carry:sum = sum + op1*op2 + carry
6204  * op2 should be in rdx
6205  * op2 is preserved, all other registers are modified
6206  */
6207 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
6208   // assert op2 is rdx
6209   mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
6210   addq(sum, carry);
6211   adcq(tmp2, 0);
6212   addq(sum, op1);
6213   adcq(tmp2, 0);
6214   movq(carry, tmp2);
6215 }
6216 
6217 /**
6218  * Perform the following multiply add operation:
6219  * carry:sum = sum + op1*op2 + carry
6220  * Preserves op1, op2 and modifies rest of registers
6221  */
6222 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
6223   // rdx:rax = op1 * op2
6224   movq(raxReg, op2);
6225   mulq(op1);
6226 
6227   //  rdx:rax = sum + carry + rdx:rax
6228   addq(sum, carry);
6229   adcq(rdxReg, 0);
6230   addq(sum, raxReg);
6231   adcq(rdxReg, 0);
6232 
6233   // carry:sum = rdx:sum
6234   movq(carry, rdxReg);
6235 }
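
// Reference sketch (not compiled): both multiply-add helpers above compute
// the same 128-bit accumulation; a plain-C++ model using unsigned __int128:
//
//   static uint64_t multiply_add_64_ref(uint64_t* sum, uint64_t op1,
//                                       uint64_t op2, uint64_t carry) {
//     unsigned __int128 t = (unsigned __int128)op1 * op2 + *sum + carry;
//     *sum = (uint64_t)t;         // low half goes back into sum
//     return (uint64_t)(t >> 64); // high half becomes the new carry
//   }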
6236 
6237 /**
 * Add a 64 bit long carry into z[] with carry propagation.
 * Preserves the z and carry register values and modifies the rest of the registers.
6240  *
6241  */
6242 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
6243   Label L_fourth_loop, L_fourth_loop_exit;
6244 
6245   movl(tmp1, 1);
6246   subl(zlen, 2);
6247   addq(Address(z, zlen, Address::times_4, 0), carry);
6248 
6249   bind(L_fourth_loop);
6250   jccb(Assembler::carryClear, L_fourth_loop_exit);
6251   subl(zlen, 2);
6252   jccb(Assembler::negative, L_fourth_loop_exit);
6253   addq(Address(z, zlen, Address::times_4, 0), tmp1);
6254   jmp(L_fourth_loop);
6255   bind(L_fourth_loop_exit);
6256 }
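
// Reference sketch (not compiled): the same carry propagation modeled on a
// little-endian array of 64-bit words (the generated code walks BigInteger's
// big-endian int layout instead):
//
//   static void add_one_64_ref(uint64_t* z, int zlen_words, uint64_t carry) {
//     for (int i = 0; i < zlen_words && carry != 0; i++) {
//       uint64_t old = z[i];
//       z[i] += carry;
//       carry = (z[i] < old) ? 1 : 0; // carry out of the 64-bit add
//     }
//   }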
6257 
6258 /**
6259  * Shift z[] left by 1 bit.
6260  * Preserves x, len, z and zlen registers and modifies rest of the registers.
6261  *
6262  */
6263 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
6264 
6265   Label L_fifth_loop, L_fifth_loop_exit;
6266 
6267   // Fifth loop
6268   // Perform primitiveLeftShift(z, zlen, 1)
6269 
6270   const Register prev_carry = tmp1;
6271   const Register new_carry = tmp4;
6272   const Register value = tmp2;
6273   const Register zidx = tmp3;
6274 
6275   // int zidx, carry;
6276   // long value;
6277   // carry = 0;
6278   // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
  //    (carry:value) = (z[zidx] << 1) | carry ;
  //    z[zidx] = value;
6281   // }
6282 
6283   movl(zidx, zlen);
6284   xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
6285 
6286   bind(L_fifth_loop);
6287   decl(zidx);  // Use decl to preserve carry flag
6288   decl(zidx);
6289   jccb(Assembler::negative, L_fifth_loop_exit);
6290 
6291   if (UseBMI2Instructions) {
6292      movq(value, Address(z, zidx, Address::times_4, 0));
6293      rclq(value, 1);
6294      rorxq(value, value, 32);
6295      movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6296   }
6297   else {
6298     // clear new_carry
6299     xorl(new_carry, new_carry);
6300 
6301     // Shift z[i] by 1, or in previous carry and save new carry
6302     movq(value, Address(z, zidx, Address::times_4, 0));
6303     shlq(value, 1);
6304     adcl(new_carry, 0);
6305 
6306     orq(value, prev_carry);
6307     rorq(value, 0x20);
6308     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
6309 
6310     // Set previous carry = new carry
6311     movl(prev_carry, new_carry);
6312   }
6313   jmp(L_fifth_loop);
6314 
6315   bind(L_fifth_loop_exit);
6316 }
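
// Reference sketch (not compiled): the same shift on a little-endian array of
// 64-bit words, least-significant word first (the generated code again walks
// the big-endian int layout):
//
//   static void lshift_by_1_ref(uint64_t* z, int zlen_words) {
//     uint64_t carry = 0;
//     for (int i = 0; i < zlen_words; i++) {
//       uint64_t next_carry = z[i] >> 63;
//       z[i] = (z[i] << 1) | carry;
//       carry = next_carry;
//     }
//   }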
6317 
6318 
6319 /**
6320  * Code for BigInteger::squareToLen() intrinsic
6321  *
6322  * rdi: x
6323  * rsi: len
6324  * r8:  z
6325  * rcx: zlen
6326  * r12: tmp1
6327  * r13: tmp2
6328  * r14: tmp3
6329  * r15: tmp4
6330  * rbx: tmp5
6331  *
6332  */
6333 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6334 
6335   Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
6336   push(tmp1);
6337   push(tmp2);
6338   push(tmp3);
6339   push(tmp4);
6340   push(tmp5);
6341 
6342   // First loop
6343   // Store the squares, right shifted one bit (i.e., divided by 2).
6344   square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
6345 
6346   // Add in off-diagonal sums.
6347   //
6348   // Second, third (nested) and fourth loops.
6349   // zlen +=2;
6350   // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
6351   //    carry = 0;
6352   //    long op2 = x[xidx:xidx+1];
6353   //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
6354   //       k -= 2;
6355   //       long op1 = x[j:j+1];
6356   //       long sum = z[k:k+1];
6357   //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
6358   //       z[k:k+1] = sum;
6359   //    }
6360   //    add_one_64(z, k, carry, tmp_regs);
6361   // }
6362 
6363   const Register carry = tmp5;
6364   const Register sum = tmp3;
6365   const Register op1 = tmp4;
6366   Register op2 = tmp2;
6367 
6368   push(zlen);
6369   push(len);
6370   addl(zlen,2);
6371   bind(L_second_loop);
6372   xorq(carry, carry);
6373   subl(zlen, 4);
6374   subl(len, 2);
6375   push(zlen);
6376   push(len);
6377   cmpl(len, 0);
6378   jccb(Assembler::lessEqual, L_second_loop_exit);
6379 
6380   // Multiply an array by one 64 bit long.
6381   if (UseBMI2Instructions) {
6382     op2 = rdxReg;
6383     movq(op2, Address(x, len, Address::times_4,  0));
6384     rorxq(op2, op2, 32);
6385   }
6386   else {
6387     movq(op2, Address(x, len, Address::times_4,  0));
6388     rorq(op2, 32);
6389   }
6390 
6391   bind(L_third_loop);
6392   decrementl(len);
6393   jccb(Assembler::negative, L_third_loop_exit);
6394   decrementl(len);
6395   jccb(Assembler::negative, L_last_x);
6396 
6397   movq(op1, Address(x, len, Address::times_4,  0));
6398   rorq(op1, 32);
6399 
6400   bind(L_multiply);
6401   subl(zlen, 2);
6402   movq(sum, Address(z, zlen, Address::times_4,  0));
6403 
6404   // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
6405   if (UseBMI2Instructions) {
6406     multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
6407   }
6408   else {
6409     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6410   }
6411 
6412   movq(Address(z, zlen, Address::times_4, 0), sum);
6413 
6414   jmp(L_third_loop);
6415   bind(L_third_loop_exit);
6416 
6417   // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses the adjusted (offset) zlen.
6420   add_one_64(z, zlen, carry, tmp1);
6421 
6422   pop(len);
6423   pop(zlen);
6424   jmp(L_second_loop);
6425 
6426   // Next infrequent code is moved outside loops.
6427   bind(L_last_x);
6428   movl(op1, Address(x, 0));
6429   jmp(L_multiply);
6430 
6431   bind(L_second_loop_exit);
6432   pop(len);
6433   pop(zlen);
6434   pop(len);
6435   pop(zlen);
6436 
6437   // Fifth loop
6438   // Shift z left 1 bit.
6439   lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
6440 
6441   // z[zlen-1] |= x[len-1] & 1;
6442   movl(tmp3, Address(x, len, Address::times_4, -4));
6443   andl(tmp3, 1);
6444   orl(Address(z, zlen, Address::times_4,  -4), tmp3);
6445 
6446   pop(tmp5);
6447   pop(tmp4);
6448   pop(tmp3);
6449   pop(tmp2);
6450   pop(tmp1);
6451 }
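
// The decomposition used above, written out for a two-word value x = a*2^64 + b:
//
//   x^2 = a^2 * 2^128 + 2*a*b * 2^64 + b^2
//
// The squares land on the diagonal and each off-diagonal product appears
// twice: square_rshift() stores the halved squares, the nested loops add each
// off-diagonal product once, lshift_by_1() doubles the whole sum, and the
// final or-in of x's low bit restores the bit lost when the squares were halved.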
6452 
6453 /**
6454  * Helper function for mul_add()
 * Multiply in[] by int k and add to out[] starting at offset offs, using
 * 128 bit by 32 bit multiplies, and return the carry in tmp5.
 * Only the quad-int-aligned portion of in[]'s length is processed here.
 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
 * This function preserves the out, in and k registers.
 * len and offset point to the appropriate index in "in" and "out" respectively.
 * tmp5 holds the carry.
 * The other registers are temporaries and are modified.
6463  *
6464  */
6465 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
6466   Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
6467   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6468 
6469   Label L_first_loop, L_first_loop_exit;
6470 
6471   movl(tmp1, len);
6472   shrl(tmp1, 2);
6473 
6474   bind(L_first_loop);
6475   subl(tmp1, 1);
6476   jccb(Assembler::negative, L_first_loop_exit);
6477 
6478   subl(len, 4);
6479   subl(offset, 4);
6480 
6481   Register op2 = tmp2;
6482   const Register sum = tmp3;
6483   const Register op1 = tmp4;
6484   const Register carry = tmp5;
6485 
6486   if (UseBMI2Instructions) {
6487     op2 = rdxReg;
6488   }
6489 
6490   movq(op1, Address(in, len, Address::times_4,  8));
6491   rorq(op1, 32);
6492   movq(sum, Address(out, offset, Address::times_4,  8));
6493   rorq(sum, 32);
6494   if (UseBMI2Instructions) {
6495     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6496   }
6497   else {
6498     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6499   }
6500   // Store back in big endian from little endian
6501   rorq(sum, 0x20);
6502   movq(Address(out, offset, Address::times_4,  8), sum);
6503 
6504   movq(op1, Address(in, len, Address::times_4,  0));
6505   rorq(op1, 32);
6506   movq(sum, Address(out, offset, Address::times_4,  0));
6507   rorq(sum, 32);
6508   if (UseBMI2Instructions) {
6509     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6510   }
6511   else {
6512     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6513   }
6514   // Store back in big endian from little endian
6515   rorq(sum, 0x20);
6516   movq(Address(out, offset, Address::times_4,  0), sum);
6517 
6518   jmp(L_first_loop);
6519   bind(L_first_loop_exit);
6520 }
6521 
6522 /**
6523  * Code for BigInteger::mulAdd() intrinsic
6524  *
6525  * rdi: out
6526  * rsi: in
6527  * r11: offs (out.length - offset)
6528  * rcx: len
6529  * r8:  k
6530  * r12: tmp1
6531  * r13: tmp2
6532  * r14: tmp3
6533  * r15: tmp4
6534  * rbx: tmp5
6535  * Multiply the in[] by word k and add to out[], return the carry in rax
6536  */
6537 void MacroAssembler::mul_add(Register out, Register in, Register offs,
6538    Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
6539    Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
6540 
6541   Label L_carry, L_last_in, L_done;
6542 
6543 // carry = 0;
6544 // for (int j=len-1; j >= 0; j--) {
6545 //    long product = (in[j] & LONG_MASK) * kLong +
6546 //                   (out[offs] & LONG_MASK) + carry;
6547 //    out[offs--] = (int)product;
6548 //    carry = product >>> 32;
6549 // }
6550 //
6551   push(tmp1);
6552   push(tmp2);
6553   push(tmp3);
6554   push(tmp4);
6555   push(tmp5);
6556 
6557   Register op2 = tmp2;
6558   const Register sum = tmp3;
6559   const Register op1 = tmp4;
6560   const Register carry =  tmp5;
6561 
  if (UseBMI2Instructions) {
    op2 = rdxReg;
  }
  movl(op2, k);
6569 
6570   xorq(carry, carry);
6571 
  // First loop

  // Multiply in[] by k in a 4-way unrolled loop using 128 bit by 32 bit multiplies.
  // The carry is in tmp5.
  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Multiply the trailing in[] entry using a 64 bit by 32 bit multiply, if any
6579   decrementl(len);
6580   jccb(Assembler::negative, L_carry);
6581   decrementl(len);
6582   jccb(Assembler::negative, L_last_in);
6583 
6584   movq(op1, Address(in, len, Address::times_4,  0));
6585   rorq(op1, 32);
6586 
6587   subl(offs, 2);
6588   movq(sum, Address(out, offs, Address::times_4,  0));
6589   rorq(sum, 32);
6590 
6591   if (UseBMI2Instructions) {
6592     multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
6593   }
6594   else {
6595     multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
6596   }
6597 
6598   // Store back in big endian from little endian
6599   rorq(sum, 0x20);
6600   movq(Address(out, offs, Address::times_4,  0), sum);
6601 
6602   testl(len, len);
6603   jccb(Assembler::zero, L_carry);
6604 
6605   //Multiply the last in[] entry, if any
6606   bind(L_last_in);
6607   movl(op1, Address(in, 0));
6608   movl(sum, Address(out, offs, Address::times_4,  -4));
6609 
6610   movl(raxReg, k);
6611   mull(op1); //tmp4 * eax -> edx:eax
6612   addl(sum, carry);
6613   adcl(rdxReg, 0);
6614   addl(sum, raxReg);
6615   adcl(rdxReg, 0);
6616   movl(carry, rdxReg);
6617 
6618   movl(Address(out, offs, Address::times_4,  -4), sum);
6619 
6620   bind(L_carry);
6621   //return tmp5/carry as carry in rax
6622   movl(rax, carry);
6623 
6624   bind(L_done);
6625   pop(tmp5);
6626   pop(tmp4);
6627   pop(tmp3);
6628   pop(tmp2);
6629   pop(tmp1);
6630 }
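
// Reference sketch (not compiled): the scalar semantics from the pseudocode
// above, in plain C++:
//
//   static uint32_t mul_add_ref(uint32_t* out, const uint32_t* in,
//                               int offs, int len, uint32_t k) {
//     uint64_t k_long = k;
//     uint64_t carry = 0;
//     for (int j = len - 1; j >= 0; j--) {
//       uint64_t product = (uint64_t)in[j] * k_long + out[offs] + carry;
//       out[offs--] = (uint32_t)product;
//       carry = product >> 32;
//     }
//     return (uint32_t)carry;
//   }
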
6631 #endif
6632 
6633 /**
6634  * Emits code to update CRC-32 with a byte value according to constants in table
6635  *
6636  * @param [in,out]crc   Register containing the crc.
6637  * @param [in]val       Register containing the byte to fold into the CRC.
6638  * @param [in]table     Register containing the table of crc constants.
6639  *
6640  * uint32_t crc;
6641  * val = crc_table[(val ^ crc) & 0xFF];
6642  * crc = val ^ (crc >> 8);
6643  *
6644  */
6645 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
6646   xorl(val, crc);
6647   andl(val, 0xFF);
6648   shrl(crc, 8); // unsigned shift
6649   xorl(crc, Address(table, val, Address::times_4, 0));
6650 }
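
// Reference sketch (not compiled): generating the table assumed above and
// applying the same byte step in plain C++, using the standard reflected
// CRC-32 polynomial 0xEDB88320 (as in java.util.zip.CRC32):
//
//   static uint32_t crc_table[256];
//   static void init_crc_table() {
//     for (uint32_t i = 0; i < 256; i++) {
//       uint32_t c = i;
//       for (int k = 0; k < 8; k++) {
//         c = (c & 1) ? (0xEDB88320u ^ (c >> 1)) : (c >> 1);
//       }
//       crc_table[i] = c;
//     }
//   }
//   static uint32_t update_byte_crc32_ref(uint32_t crc, uint8_t val) {
//     return crc_table[(val ^ crc) & 0xFF] ^ (crc >> 8);
//   }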
6651 
6652 /**
6653  * Fold 128-bit data chunk
6654  */
6655 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
6656   if (UseAVX > 0) {
6657     vpclmulhdq(xtmp, xK, xcrc); // [123:64]
6658     vpclmulldq(xcrc, xK, xcrc); // [63:0]
6659     vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
6660     pxor(xcrc, xtmp);
6661   } else {
6662     movdqa(xtmp, xcrc);
6663     pclmulhdq(xtmp, xK);   // [123:64]
6664     pclmulldq(xcrc, xK);   // [63:0]
6665     pxor(xcrc, xtmp);
6666     movdqu(xtmp, Address(buf, offset));
6667     pxor(xcrc, xtmp);
6668   }
6669 }
6670 
6671 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
6672   if (UseAVX > 0) {
6673     vpclmulhdq(xtmp, xK, xcrc);
6674     vpclmulldq(xcrc, xK, xcrc);
6675     pxor(xcrc, xbuf);
6676     pxor(xcrc, xtmp);
6677   } else {
6678     movdqa(xtmp, xcrc);
6679     pclmulhdq(xtmp, xK);
6680     pclmulldq(xcrc, xK);
6681     pxor(xcrc, xbuf);
6682     pxor(xcrc, xtmp);
6683   }
6684 }
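
// Reference sketch (not compiled): carry-less (GF(2)) multiplication, the
// scalar analogue of the pclmulqdq folds used in the helpers above;
// illustrative only.
//
//   static uint64_t clmul32(uint32_t a, uint32_t b) {
//     uint64_t r = 0;
//     for (int i = 0; i < 32; i++) {
//       if ((b >> i) & 1) {
//         r ^= (uint64_t)a << i; // XOR instead of add: no carries in GF(2)
//       }
//     }
//     return r;
//   }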
6685 
6686 /**
6687  * 8-bit folds to compute 32-bit CRC
6688  *
6689  * uint64_t xcrc;
6690  * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
6691  */
6692 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
6693   movdl(tmp, xcrc);
6694   andl(tmp, 0xFF);
6695   movdl(xtmp, Address(table, tmp, Address::times_4, 0));
6696   psrldq(xcrc, 1); // unsigned shift one byte
6697   pxor(xcrc, xtmp);
6698 }
6699 
6700 /**
6701  * uint32_t crc;
6702  * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
6703  */
6704 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
6705   movl(tmp, crc);
6706   andl(tmp, 0xFF);
6707   shrl(crc, 8);
6708   xorl(crc, Address(table, tmp, Address::times_4, 0));
6709 }
6710 
6711 /**
6712  * @param crc   register containing existing CRC (32-bit)
6713  * @param buf   register pointing to input byte buffer (byte*)
6714  * @param len   register containing number of bytes
6715  * @param table register that will contain address of CRC table
6716  * @param tmp   scratch register
6717  */
6718 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
6719   assert_different_registers(crc, buf, len, table, tmp, rax);
6720 
6721   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
6722   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
6723 
6724   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
6725   // context for the registers used, where all instructions below are using 128-bit mode
6726   // On EVEX without VL and BW, these instructions will all be AVX.
6727   lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
6728   notl(crc); // ~crc
6729   cmpl(len, 16);
6730   jcc(Assembler::less, L_tail);
6731 
6732   // Align buffer to 16 bytes
6733   movl(tmp, buf);
6734   andl(tmp, 0xF);
6735   jccb(Assembler::zero, L_aligned);
6736   subl(tmp,  16);
6737   addl(len, tmp);
6738 
6739   align(4);
6740   BIND(L_align_loop);
6741   movsbl(rax, Address(buf, 0)); // load byte with sign extension
6742   update_byte_crc32(crc, rax, table);
6743   increment(buf);
6744   incrementl(tmp);
6745   jccb(Assembler::less, L_align_loop);
6746 
6747   BIND(L_aligned);
6748   movl(tmp, len); // save
6749   shrl(len, 4);
6750   jcc(Assembler::zero, L_tail_restore);
6751 
6752   // Fold crc into first bytes of vector
6753   movdqa(xmm1, Address(buf, 0));
6754   movdl(rax, xmm1);
6755   xorl(crc, rax);
6756   if (VM_Version::supports_sse4_1()) {
6757     pinsrd(xmm1, crc, 0);
6758   } else {
6759     pinsrw(xmm1, crc, 0);
6760     shrl(crc, 16);
6761     pinsrw(xmm1, crc, 1);
6762   }
6763   addptr(buf, 16);
6764   subl(len, 4); // len > 0
6765   jcc(Assembler::less, L_fold_tail);
6766 
6767   movdqa(xmm2, Address(buf,  0));
6768   movdqa(xmm3, Address(buf, 16));
6769   movdqa(xmm4, Address(buf, 32));
6770   addptr(buf, 48);
6771   subl(len, 3);
6772   jcc(Assembler::lessEqual, L_fold_512b);
6773 
6774   // Fold total 512 bits of polynomial on each iteration,
6775   // 128 bits per each of 4 parallel streams.
6776   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
6777 
6778   align(32);
6779   BIND(L_fold_512b_loop);
6780   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
6781   fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
6782   fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
6783   fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
6784   addptr(buf, 64);
6785   subl(len, 4);
6786   jcc(Assembler::greater, L_fold_512b_loop);
6787 
6788   // Fold 512 bits to 128 bits.
6789   BIND(L_fold_512b);
6790   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6791   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
6792   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
6793   fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
6794 
6795   // Fold the rest of 128 bits data chunks
6796   BIND(L_fold_tail);
6797   addl(len, 3);
6798   jccb(Assembler::lessEqual, L_fold_128b);
6799   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
6800 
6801   BIND(L_fold_tail_loop);
6802   fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
6803   addptr(buf, 16);
6804   decrementl(len);
6805   jccb(Assembler::greater, L_fold_tail_loop);
6806 
6807   // Fold 128 bits in xmm1 down into 32 bits in crc register.
6808   BIND(L_fold_128b);
6809   movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
6810   if (UseAVX > 0) {
6811     vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
6812     vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
6813     vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
6814   } else {
6815     movdqa(xmm2, xmm0);
6816     pclmulqdq(xmm2, xmm1, 0x1);
6817     movdqa(xmm3, xmm0);
6818     pand(xmm3, xmm2);
6819     pclmulqdq(xmm0, xmm3, 0x1);
6820   }
6821   psrldq(xmm1, 8);
6822   psrldq(xmm2, 4);
6823   pxor(xmm0, xmm1);
6824   pxor(xmm0, xmm2);
6825 
6826   // 8 8-bit folds to compute 32-bit CRC.
6827   for (int j = 0; j < 4; j++) {
6828     fold_8bit_crc32(xmm0, table, xmm1, rax);
6829   }
6830   movdl(crc, xmm0); // mov 32 bits to general register
6831   for (int j = 0; j < 4; j++) {
6832     fold_8bit_crc32(crc, table, rax);
6833   }
6834 
6835   BIND(L_tail_restore);
6836   movl(len, tmp); // restore
6837   BIND(L_tail);
6838   andl(len, 0xf);
6839   jccb(Assembler::zero, L_exit);
6840 
6841   // Fold the rest of bytes
6842   align(4);
6843   BIND(L_tail_loop);
6844   movsbl(rax, Address(buf, 0)); // load byte with sign extension
6845   update_byte_crc32(crc, rax, table);
6846   increment(buf);
6847   decrementl(len);
6848   jccb(Assembler::greater, L_tail_loop);
6849 
6850   BIND(L_exit);
6851   notl(crc); // ~c
6852 }
6853 
6854 #ifdef _LP64
6855 // Helper function for AVX 512 CRC32
6856 // Fold 512-bit data chunks
6857 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
6858                                              Register pos, int offset) {
6859   evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
6860   evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
6861   evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
6862   evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
6863   evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
6864 }
6865 
6866 // Helper function for AVX 512 CRC32
6867 // Compute CRC32 for < 256B buffers
6868 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
6869                                               Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
6870                                               Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
6871 
6872   Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
6873   Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
6874   Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;
6875 
6876   // check if there is enough buffer to be able to fold 16B at a time
6877   cmpl(len, 32);
6878   jcc(Assembler::less, L_less_than_32);
6879 
6880   // if there is, load the constants
6881   movdqu(xmm10, Address(key, 1 * 16));    //rk1 and rk2 in xmm10
6882   movdl(xmm0, crc);                        // get the initial crc value
6883   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
6884   pxor(xmm7, xmm0);
6885 
6886   // update the buffer pointer
6887   addl(pos, 16);
  // update the counter: subtract 32 instead of 16 to save one instruction in the loop
6889   subl(len, 32);
6890   jmp(L_16B_reduction_loop);
6891 
6892   bind(L_less_than_32);
  // move the initial crc into the return value; this is necessary for zero-length buffers
6894   movl(rax, crc);
6895   testl(len, len);
6896   jcc(Assembler::equal, L_cleanup);
6897 
6898   movdl(xmm0, crc);                        //get the initial crc value
6899 
6900   cmpl(len, 16);
6901   jcc(Assembler::equal, L_exact_16_left);
6902   jcc(Assembler::less, L_less_than_16_left);
6903 
6904   movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
6905   pxor(xmm7, xmm0);                       //xor the initial crc value
6906   addl(pos, 16);
6907   subl(len, 16);
6908   movdqu(xmm10, Address(key, 1 * 16));    // rk1 and rk2 in xmm10
6909   jmp(L_get_last_two_xmms);
6910 
6911   bind(L_less_than_16_left);
  // use stack space to load data of fewer than 16 bytes; zero out the 16B in memory first
6913   pxor(xmm1, xmm1);
6914   movptr(tmp1, rsp);
6915   movdqu(Address(tmp1, 0 * 16), xmm1);
6916 
6917   cmpl(len, 4);
6918   jcc(Assembler::less, L_only_less_than_4);
6919 
  // back up the counter value
6921   movl(tmp2, len);
6922   cmpl(len, 8);
6923   jcc(Assembler::less, L_less_than_8_left);
6924 
6925   //load 8 Bytes
6926   movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
6927   movq(Address(tmp1, 0 * 16), rax);
6928   addptr(tmp1, 8);
6929   subl(len, 8);
6930   addl(pos, 8);
6931 
6932   bind(L_less_than_8_left);
6933   cmpl(len, 4);
6934   jcc(Assembler::less, L_less_than_4_left);
6935 
6936   //load 4 Bytes
6937   movl(rax, Address(buf, pos, Address::times_1, 0));
6938   movl(Address(tmp1, 0 * 16), rax);
6939   addptr(tmp1, 4);
6940   subl(len, 4);
6941   addl(pos, 4);
6942 
6943   bind(L_less_than_4_left);
6944   cmpl(len, 2);
6945   jcc(Assembler::less, L_less_than_2_left);
6946 
6947   // load 2 Bytes
6948   movw(rax, Address(buf, pos, Address::times_1, 0));
6949   movl(Address(tmp1, 0 * 16), rax);
6950   addptr(tmp1, 2);
6951   subl(len, 2);
6952   addl(pos, 2);
6953 
6954   bind(L_less_than_2_left);
6955   cmpl(len, 1);
6956   jcc(Assembler::less, L_zero_left);
6957 
6958   // load 1 Byte
6959   movb(rax, Address(buf, pos, Address::times_1, 0));
6960   movb(Address(tmp1, 0 * 16), rax);
6961 
6962   bind(L_zero_left);
6963   movdqu(xmm7, Address(rsp, 0));
6964   pxor(xmm7, xmm0);                       //xor the initial crc value
6965 
6966   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
6967   movdqu(xmm0, Address(rax, tmp2));
6968   pshufb(xmm7, xmm0);
6969   jmp(L_128_done);
6970 
6971   bind(L_exact_16_left);
6972   movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
6973   pxor(xmm7, xmm0);                       //xor the initial crc value
6974   jmp(L_128_done);
6975 
6976   bind(L_only_less_than_4);
6977   cmpl(len, 3);
6978   jcc(Assembler::less, L_only_less_than_3);
6979 
6980   // load 3 Bytes
6981   movb(rax, Address(buf, pos, Address::times_1, 0));
6982   movb(Address(tmp1, 0), rax);
6983 
6984   movb(rax, Address(buf, pos, Address::times_1, 1));
6985   movb(Address(tmp1, 1), rax);
6986 
6987   movb(rax, Address(buf, pos, Address::times_1, 2));
6988   movb(Address(tmp1, 2), rax);
6989 
6990   movdqu(xmm7, Address(rsp, 0));
6991   pxor(xmm7, xmm0);                     //xor the initial crc value
6992 
6993   pslldq(xmm7, 0x5);
6994   jmp(L_barrett);
6995   bind(L_only_less_than_3);
6996   cmpl(len, 2);
6997   jcc(Assembler::less, L_only_less_than_2);
6998 
6999   // load 2 Bytes
7000   movb(rax, Address(buf, pos, Address::times_1, 0));
7001   movb(Address(tmp1, 0), rax);
7002 
7003   movb(rax, Address(buf, pos, Address::times_1, 1));
7004   movb(Address(tmp1, 1), rax);
7005 
7006   movdqu(xmm7, Address(rsp, 0));
7007   pxor(xmm7, xmm0);                     //xor the initial crc value
7008 
7009   pslldq(xmm7, 0x6);
7010   jmp(L_barrett);
7011 
7012   bind(L_only_less_than_2);
7013   //load 1 Byte
7014   movb(rax, Address(buf, pos, Address::times_1, 0));
7015   movb(Address(tmp1, 0), rax);
7016 
7017   movdqu(xmm7, Address(rsp, 0));
7018   pxor(xmm7, xmm0);                     //xor the initial crc value
7019 
7020   pslldq(xmm7, 0x7);
7021 }
7022 
/**
 * Compute CRC32 using AVX512 instructions
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param key   register that will contain the address of the CRC table
 * @param tmp1  scratch register
 * @param tmp2  scratch register
 * @return rax  result register
 */
7032 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
7033   assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);
7034 
7035   Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7036   Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7037   Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
7038   Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
7039   Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;
7040 
7041   const Register pos = r12;
7042   push(r12);
7043   subptr(rsp, 16 * 2 + 8);
7044 
7045   // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7046   // context for the registers used, where all instructions below are using 128-bit mode
7047   // On EVEX without VL and BW, these instructions will all be AVX.
7048   lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
7049   notl(crc);
7050   movl(pos, 0);
7051 
7052   // check if smaller than 256B
7053   cmpl(len, 256);
7054   jcc(Assembler::less, L_less_than_256);
7055 
7056   // load the initial crc value
7057   movdl(xmm10, crc);
7058 
7059   // receive the initial 64B data, xor the initial crc value
7060   evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
7061   evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
7062   evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7063   evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7064 
7065   subl(len, 256);
7066   cmpl(len, 256);
7067   jcc(Assembler::less, L_fold_128_B_loop);
7068 
7069   evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
7070   evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7071   evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7072   subl(len, 256);
7073 
7074   bind(L_fold_256_B_loop);
7075   addl(pos, 256);
7076   fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
7077   fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
7078   fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
7079   fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);
7080 
7081   subl(len, 256);
7082   jcc(Assembler::greaterEqual, L_fold_256_B_loop);
7083 
7084   // Fold 256 into 128
7085   addl(pos, 256);
7086   evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
7087   evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
7088   vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC
7089 
7090   evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
7091   evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
7092   vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC
7093 
7094   evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
7095   evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);
7096 
7097   addl(len, 128);
7098   jmp(L_fold_128_B_register);
7099 
  // at this point in the code, there are 128 * x + y (0 <= y < 128) bytes of buffer. The
  // fold_128_B_loop will fold 128B at a time until we have 128 + y bytes of buffer

  // fold 128B at a time. This section of the code folds two 512-bit registers in parallel
7104   bind(L_fold_128_B_loop);
7105   addl(pos, 128);
7106   fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
7107   fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
7108 
7109   subl(len, 128);
7110   jcc(Assembler::greaterEqual, L_fold_128_B_loop);
7111 
7112   addl(pos, 128);
7113 
  // at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
  // the 128B of folded data is in xmm0 and xmm4 (each used as a 512-bit register)
7116   bind(L_fold_128_B_register);
7117   evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7118   evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7119   evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
7120   evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
7121   // save last that has no multiplicand
7122   vextracti64x2(xmm7, xmm4, 3);
7123 
7124   evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
7125   evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
7126   // Needed later in reduction loop
7127   movdqu(xmm10, Address(key, 1 * 16));
7128   vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7129   vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7130 
7131   // Swap 1,0,3,2 - 01 00 11 10
7132   evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
7133   evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
7134   vextracti128(xmm5, xmm8, 1);
7135   evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);
7136 
7137   // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
7138   // instead of a cmp instruction, we use the negative flag with the jl instruction
7139   addl(len, 128 - 16);
7140   jcc(Assembler::less, L_final_reduction_for_128);
7141 
7142   bind(L_16B_reduction_loop);
7143   vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
7144   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7145   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7146   movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
7147   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7148   addl(pos, 16);
7149   subl(len, 16);
7150   jcc(Assembler::greaterEqual, L_16B_reduction_loop);
7151 
7152   bind(L_final_reduction_for_128);
7153   addl(len, 16);
7154   jcc(Assembler::equal, L_128_done);
7155 
7156   bind(L_get_last_two_xmms);
7157   movdqu(xmm2, xmm7);
7158   addl(pos, len);
7159   movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
7160   subl(pos, len);
7161 
7162   // get rid of the extra data that was loaded before
7163   // load the shift constant
7164   lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
7165   movdqu(xmm0, Address(rax, len));
7166   addl(rax, len);
7167 
7168   vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7169   //Change mask to 512
7170   vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
7171   vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7172 
7173   blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7174   vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
7175   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7176   vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7177   vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7178 
7179   bind(L_128_done);
7180   // compute crc of a 128-bit value
7181   movdqu(xmm10, Address(key, 3 * 16));
7182   movdqu(xmm0, xmm7);
7183 
7184   // 64b fold
7185   vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
7186   vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
7187   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7188 
7189   // 32b fold
7190   movdqu(xmm0, xmm7);
7191   vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
7192   vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
7193   vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
7194   jmp(L_barrett);
7195 
7196   bind(L_less_than_256);
7197   kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7198 
  // Barrett reduction
7200   bind(L_barrett);
7201   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
7202   movdqu(xmm1, xmm7);
7203   movdqu(xmm2, xmm7);
7204   movdqu(xmm10, Address(key, 4 * 16));
7205 
7206   pclmulqdq(xmm7, xmm10, 0x0);
7207   pxor(xmm7, xmm2);
7208   vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
7209   movdqu(xmm2, xmm7);
7210   pclmulqdq(xmm7, xmm10, 0x10);
7211   pxor(xmm7, xmm2);
7212   pxor(xmm7, xmm1);
7213   pextrd(crc, xmm7, 2);
7214 
7215   bind(L_cleanup);
7216   notl(crc); // ~c
7217   addptr(rsp, 16 * 2 + 8);
7218   pop(r12);
7219 }
7220 
7221 // S. Gueron / Information Processing Letters 112 (2012) 184
7222 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
7223 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
7224 // Output: the 64-bit carry-less product of B * CONST
7225 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
7226                                      Register tmp1, Register tmp2, Register tmp3) {
7227   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7228   if (n > 0) {
7229     addq(tmp3, n * 256 * 8);
7230   }
7231   //    Q1 = TABLEExt[n][B & 0xFF];
7232   movl(tmp1, in);
7233   andl(tmp1, 0x000000FF);
7234   shll(tmp1, 3);
7235   addq(tmp1, tmp3);
7236   movq(tmp1, Address(tmp1, 0));
7237 
7238   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7239   movl(tmp2, in);
7240   shrl(tmp2, 8);
7241   andl(tmp2, 0x000000FF);
7242   shll(tmp2, 3);
7243   addq(tmp2, tmp3);
7244   movq(tmp2, Address(tmp2, 0));
7245 
7246   shlq(tmp2, 8);
7247   xorq(tmp1, tmp2);
7248 
7249   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7250   movl(tmp2, in);
7251   shrl(tmp2, 16);
7252   andl(tmp2, 0x000000FF);
7253   shll(tmp2, 3);
7254   addq(tmp2, tmp3);
7255   movq(tmp2, Address(tmp2, 0));
7256 
7257   shlq(tmp2, 16);
7258   xorq(tmp1, tmp2);
7259 
7260   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7261   shrl(in, 24);
7262   andl(in, 0x000000FF);
7263   shll(in, 3);
7264   addq(in, tmp3);
7265   movq(in, Address(in, 0));
7266 
7267   shlq(in, 24);
7268   xorq(in, tmp1);
7269   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7270 }
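
// Reference sketch (not compiled): the same table-driven carry-less multiply
// in plain C++, assuming TABLE[n][b] holds the 64-bit carry-less product of
// byte value b and CONST_n (the table layout here is illustrative):
//
//   static uint64_t crc32c_alg4_ref(uint32_t B, const uint64_t TABLE[][256], uint32_t n) {
//     uint64_t q1 = TABLE[n][B & 0xFF];
//     uint64_t q2 = TABLE[n][(B >> 8) & 0xFF];
//     uint64_t q3 = TABLE[n][(B >> 16) & 0xFF];
//     uint64_t q4 = TABLE[n][(B >> 24) & 0xFF];
//     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
//   }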
7271 
7272 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7273                                       Register in_out,
7274                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7275                                       XMMRegister w_xtmp2,
7276                                       Register tmp1,
7277                                       Register n_tmp2, Register n_tmp3) {
7278   if (is_pclmulqdq_supported) {
7279     movdl(w_xtmp1, in_out); // modified blindly
7280 
7281     movl(tmp1, const_or_pre_comp_const_index);
7282     movdl(w_xtmp2, tmp1);
7283     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7284 
7285     movdq(in_out, w_xtmp1);
7286   } else {
7287     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
7288   }
7289 }
7290 
7291 // Recombination Alternative 2: No bit-reflections
7292 // T1 = (CRC_A * U1) << 1
7293 // T2 = (CRC_B * U2) << 1
7294 // C1 = T1 >> 32
7295 // C2 = T2 >> 32
7296 // T1 = T1 & 0xFFFFFFFF
7297 // T2 = T2 & 0xFFFFFFFF
7298 // T1 = CRC32(0, T1)
7299 // T2 = CRC32(0, T2)
7300 // C1 = C1 ^ T1
7301 // C2 = C2 ^ T2
7302 // CRC = C1 ^ C2 ^ CRC_C
7303 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7304                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7305                                      Register tmp1, Register tmp2,
7306                                      Register n_tmp3) {
7307   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7308   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7309   shlq(in_out, 1);
7310   movl(tmp1, in_out);
7311   shrq(in_out, 32);
7312   xorl(tmp2, tmp2);
7313   crc32(tmp2, tmp1, 4);
7314   xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
7315   shlq(in1, 1);
7316   movl(tmp1, in1);
7317   shrq(in1, 32);
7318   xorl(tmp2, tmp2);
7319   crc32(tmp2, tmp1, 4);
7320   xorl(in1, tmp2);
7321   xorl(in_out, in1);
7322   xorl(in_out, in2);
7323 }
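
// Reference sketch (not compiled): the recombination above in plain C++,
// assuming clmul32() is a scalar carry-less multiply and crc32c_u32() models
// one 32-bit step of the CRC32 instruction (both names are illustrative):
//
//   static uint32_t crc32c_recombine_ref(uint32_t crc_a, uint32_t crc_b, uint32_t crc_c,
//                                        uint32_t u1, uint32_t u2) {
//     uint64_t t1 = clmul32(crc_a, u1) << 1;
//     uint64_t t2 = clmul32(crc_b, u2) << 1;
//     uint32_t c1 = (uint32_t)(t1 >> 32) ^ crc32c_u32(0, (uint32_t)t1);
//     uint32_t c2 = (uint32_t)(t2 >> 32) ^ crc32c_u32(0, (uint32_t)t2);
//     return c1 ^ c2 ^ crc_c;
//   }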
7324 
// Set N to a predefined value
// Subtract it from the length of the buffer
7327 // execute in a loop:
7328 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
7329 // for i = 1 to N do
7330 //  CRC_A = CRC32(CRC_A, A[i])
7331 //  CRC_B = CRC32(CRC_B, B[i])
7332 //  CRC_C = CRC32(CRC_C, C[i])
7333 // end for
7334 // Recombine
7335 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7336                                        Register in_out1, Register in_out2, Register in_out3,
7337                                        Register tmp1, Register tmp2, Register tmp3,
7338                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7339                                        Register tmp4, Register tmp5,
7340                                        Register n_tmp6) {
7341   Label L_processPartitions;
7342   Label L_processPartition;
7343   Label L_exit;
7344 
7345   bind(L_processPartitions);
7346   cmpl(in_out1, 3 * size);
7347   jcc(Assembler::less, L_exit);
7348     xorl(tmp1, tmp1);
7349     xorl(tmp2, tmp2);
7350     movq(tmp3, in_out2);
7351     addq(tmp3, size);
7352 
7353     bind(L_processPartition);
7354       crc32(in_out3, Address(in_out2, 0), 8);
7355       crc32(tmp1, Address(in_out2, size), 8);
7356       crc32(tmp2, Address(in_out2, size * 2), 8);
7357       addq(in_out2, 8);
7358       cmpq(in_out2, tmp3);
7359       jcc(Assembler::less, L_processPartition);
7360     crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7361             w_xtmp1, w_xtmp2, w_xtmp3,
7362             tmp4, tmp5,
7363             n_tmp6);
7364     addq(in_out2, 2 * size);
7365     subl(in_out1, 3 * size);
7366     jmp(L_processPartitions);
7367 
7368   bind(L_exit);
7369 }
7370 #else
7371 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
7372                                      Register tmp1, Register tmp2, Register tmp3,
7373                                      XMMRegister xtmp1, XMMRegister xtmp2) {
7374   lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
7375   if (n > 0) {
7376     addl(tmp3, n * 256 * 8);
7377   }
7378   //    Q1 = TABLEExt[n][B & 0xFF];
7379   movl(tmp1, in_out);
7380   andl(tmp1, 0x000000FF);
7381   shll(tmp1, 3);
7382   addl(tmp1, tmp3);
7383   movq(xtmp1, Address(tmp1, 0));
7384 
7385   //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
7386   movl(tmp2, in_out);
7387   shrl(tmp2, 8);
7388   andl(tmp2, 0x000000FF);
7389   shll(tmp2, 3);
7390   addl(tmp2, tmp3);
7391   movq(xtmp2, Address(tmp2, 0));
7392 
7393   psllq(xtmp2, 8);
7394   pxor(xtmp1, xtmp2);
7395 
7396   //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
7397   movl(tmp2, in_out);
7398   shrl(tmp2, 16);
7399   andl(tmp2, 0x000000FF);
7400   shll(tmp2, 3);
7401   addl(tmp2, tmp3);
7402   movq(xtmp2, Address(tmp2, 0));
7403 
7404   psllq(xtmp2, 16);
7405   pxor(xtmp1, xtmp2);
7406 
7407   //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
7408   shrl(in_out, 24);
7409   andl(in_out, 0x000000FF);
7410   shll(in_out, 3);
7411   addl(in_out, tmp3);
7412   movq(xtmp2, Address(in_out, 0));
7413 
7414   psllq(xtmp2, 24);
7415   pxor(xtmp1, xtmp2); // Result in CXMM
7416   //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
7417 }
7418 
7419 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
7420                                       Register in_out,
7421                                       uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
7422                                       XMMRegister w_xtmp2,
7423                                       Register tmp1,
7424                                       Register n_tmp2, Register n_tmp3) {
7425   if (is_pclmulqdq_supported) {
7426     movdl(w_xtmp1, in_out);
7427 
7428     movl(tmp1, const_or_pre_comp_const_index);
7429     movdl(w_xtmp2, tmp1);
7430     pclmulqdq(w_xtmp1, w_xtmp2, 0);
7431     // Keep result in XMM since GPR is 32 bit in length
7432   } else {
7433     crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
7434   }
7435 }
7436 
7437 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
7438                                      XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7439                                      Register tmp1, Register tmp2,
7440                                      Register n_tmp3) {
7441   crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7442   crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
7443 
7444   psllq(w_xtmp1, 1);
7445   movdl(tmp1, w_xtmp1);
7446   psrlq(w_xtmp1, 32);
7447   movdl(in_out, w_xtmp1);
7448 
7449   xorl(tmp2, tmp2);
7450   crc32(tmp2, tmp1, 4);
7451   xorl(in_out, tmp2);
7452 
7453   psllq(w_xtmp2, 1);
7454   movdl(tmp1, w_xtmp2);
7455   psrlq(w_xtmp2, 32);
7456   movdl(in1, w_xtmp2);
7457 
7458   xorl(tmp2, tmp2);
7459   crc32(tmp2, tmp1, 4);
7460   xorl(in1, tmp2);
7461   xorl(in_out, in1);
7462   xorl(in_out, in2);
7463 }
7464 
7465 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
7466                                        Register in_out1, Register in_out2, Register in_out3,
7467                                        Register tmp1, Register tmp2, Register tmp3,
7468                                        XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7469                                        Register tmp4, Register tmp5,
7470                                        Register n_tmp6) {
7471   Label L_processPartitions;
7472   Label L_processPartition;
7473   Label L_exit;
7474 
7475   bind(L_processPartitions);
7476   cmpl(in_out1, 3 * size);
7477   jcc(Assembler::less, L_exit);
7478     xorl(tmp1, tmp1);
7479     xorl(tmp2, tmp2);
7480     movl(tmp3, in_out2);
7481     addl(tmp3, size);
7482 
7483     bind(L_processPartition);
7484       crc32(in_out3, Address(in_out2, 0), 4);
7485       crc32(tmp1, Address(in_out2, size), 4);
7486       crc32(tmp2, Address(in_out2, size*2), 4);
7487       crc32(in_out3, Address(in_out2, 0+4), 4);
7488       crc32(tmp1, Address(in_out2, size+4), 4);
7489       crc32(tmp2, Address(in_out2, size*2+4), 4);
7490       addl(in_out2, 8);
7491       cmpl(in_out2, tmp3);
7492       jcc(Assembler::less, L_processPartition);
7493 
7494         push(tmp3);
7495         push(in_out1);
7496         push(in_out2);
7497         tmp4 = tmp3;
7498         tmp5 = in_out1;
7499         n_tmp6 = in_out2;
7500 
7501       crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
7502             w_xtmp1, w_xtmp2, w_xtmp3,
7503             tmp4, tmp5,
7504             n_tmp6);
7505 
7506         pop(in_out2);
7507         pop(in_out1);
7508         pop(tmp3);
7509 
7510     addl(in_out2, 2 * size);
7511     subl(in_out1, 3 * size);
7512     jmp(L_processPartitions);
7513 
7514   bind(L_exit);
7515 }
7516 #endif //LP64
7517 
7518 #ifdef _LP64
7519 // Algorithm 2: Pipelined usage of the CRC32 instruction.
7520 // Input: A buffer I of L bytes.
7521 // Output: the CRC32C value of the buffer.
7522 // Notations:
7523 // Write L = 24N + r, with N = floor (L/24).
7524 // r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B and C each consist
// of N quadwords, and R consists of r bytes.
7527 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
7528 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
7529 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
7530 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
7531 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
7532                                           Register tmp1, Register tmp2, Register tmp3,
7533                                           Register tmp4, Register tmp5, Register tmp6,
7534                                           XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
7535                                           bool is_pclmulqdq_supported) {
7536   uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
7537   Label L_wordByWord;
7538   Label L_byteByByteProlog;
7539   Label L_byteByByte;
7540   Label L_exit;
7541 
  if (is_pclmulqdq_supported) {
7543     const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
7544     const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
7545 
7546     const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
7547     const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
7548 
7549     const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
7550     const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
7551     assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
7552   } else {
7553     const_or_pre_comp_const_index[0] = 1;
7554     const_or_pre_comp_const_index[1] = 0;
7555 
7556     const_or_pre_comp_const_index[2] = 3;
7557     const_or_pre_comp_const_index[3] = 2;
7558 
7559     const_or_pre_comp_const_index[4] = 5;
7560     const_or_pre_comp_const_index[5] = 4;
  }
7562   crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
7563                     in2, in1, in_out,
7564                     tmp1, tmp2, tmp3,
7565                     w_xtmp1, w_xtmp2, w_xtmp3,
7566                     tmp4, tmp5,
7567                     tmp6);
7568   crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
7569                     in2, in1, in_out,
7570                     tmp1, tmp2, tmp3,
7571                     w_xtmp1, w_xtmp2, w_xtmp3,
7572                     tmp4, tmp5,
7573                     tmp6);
7574   crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
7575                     in2, in1, in_out,
7576                     tmp1, tmp2, tmp3,
7577                     w_xtmp1, w_xtmp2, w_xtmp3,
7578                     tmp4, tmp5,
7579                     tmp6);
7580   movl(tmp1, in2);
7581   andl(tmp1, 0x00000007);
7582   negl(tmp1);
7583   addl(tmp1, in2);
7584   addq(tmp1, in1);
7585 
7586   BIND(L_wordByWord);
7587   cmpq(in1, tmp1);
7588   jcc(Assembler::greaterEqual, L_byteByByteProlog);
7589     crc32(in_out, Address(in1, 0), 4);
7590     addq(in1, 4);
7591     jmp(L_wordByWord);
7592 
7593   BIND(L_byteByByteProlog);
7594   andl(in2, 0x00000007);
7595   movl(tmp2, 1);
7596 
7597   BIND(L_byteByByte);
7598   cmpl(tmp2, in2);
7599   jccb(Assembler::greater, L_exit);
7600     crc32(in_out, Address(in1, 0), 1);
7601     incq(in1);
7602     incl(tmp2);
7603     jmp(L_byteByByte);
7604 
7605   BIND(L_exit);
7606 }
#else
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported) {
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
  } else {
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addl(tmp1, in1);
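  // As in the 64-bit variant above: tmp1 = in1 + (in2 & ~7), the end of
  // the region that L_wordByWord can process four bytes at a time.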

  BIND(L_wordByWord);
  cmpl(in1, tmp1);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
    crc32(in_out, Address(in1, 0), 4);
    addl(in1, 4);
    jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);
  movl(tmp2, 1);

  BIND(L_byteByByte);
  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
    movb(tmp1, Address(in1, 0));
    crc32(in_out, tmp1, 1);
    incl(in1);
    incl(tmp2);
    jmp(L_byteByByte);

  BIND(L_exit);
}
#endif // LP64
#undef BIND
#undef BLOCK_COMMENT

// Compress char[] array to byte[].
//   ../jdk/src/java.base/share/classes/java/lang/StringUTF16.java
//   @HotSpotIntrinsicCandidate
//   private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
//     for (int i = 0; i < len; i++) {
//       int c = src[srcOff++];
//       if (c >>> 8 != 0) {
//         return 0;
//       }
//       dst[dstOff++] = (byte)c;
//     }
//     return len;
//   }
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
  XMMRegister tmp1Reg, XMMRegister tmp2Reg,
  XMMRegister tmp3Reg, XMMRegister tmp4Reg,
  Register tmp5, Register result) {
  Label copy_chars_loop, return_length, return_zero, done;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "len and result must be in different registers");

  // save length for return
  push(len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold;

    // alignment
    Label post_alignment;

    // if the length of the string is less than 32, handle it the old-fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible (<= 0xFF).
    // Create mask to test for Unicode chars inside zmm vector
    movl(result, 0x00FF);
    evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);

    testl(len, -64);
    jcc(Assembler::zero, post_alignment);

    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));   // tmp5 = (-dst) & 31: chars needed to 32-byte-align dst

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, post_alignment);

    // ~(~0 << tmp5), where tmp5 is the # of elements to process to reach alignment
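    // Worked example (hypothetical value): tmp5 == 5 gives
    // result = ~(0xFFFFFFFF << 5) = 0x0000001F, enabling word lanes 0..4.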
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, tmp5);
    notl(result);
    kmovdl(k3, result);

    evmovdquw(tmp1Reg, k3, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
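    // k2[i] = 1 iff char i <= 0xFF (only lanes enabled in k3 participate);
    // ktestd sets CF iff (k3 & ~k2) == 0, so carryClear means at least one
    // selected char does not fit in a byte and compression must fail.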
    evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    ktestd(k2, k3);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);

    addptr(src, tmp5);
    addptr(src, tmp5);          // src is a char array: advance by 2 * tmp5 bytes
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1));    // tail count (in chars)
    andl(len, ~(32 - 1));    // vector count (in chars)
    jcc(Assembler::zero, copy_loop_tail);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);
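    // Loop idiom used throughout this file: src/dst point one-past-the-end
    // of the vector region and len runs from -count up to zero, so a single
    // addptr(len, 32) both advances the indices and yields the ZF that
    // terminates the loop.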

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(k2, k2);          // CF is set iff all 32 chars passed the <= 0xFF test
    jcc(Assembler::carryClear, return_zero);

    // All elements in the current chunk are valid candidates for
    // compression. Write the truncated byte elements to memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, return_length);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, len);
    notl(result);

    kmovdl(k3, result);

    evmovdquw(tmp1Reg, k3, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
    evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    ktestd(k2, k3);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
    jmp(return_length);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail;

    movl(result, len);

    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors

    // vectored compression
    andl(len, 0xfffffff0);    // vector count (in chars)
    andl(result, 0x0000000f); // tail count (in chars)
    testl(len, len);
    jcc(Assembler::zero, copy_16);

    // compress 16 chars per iter
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
    pxor(tmp4Reg, tmp4Reg);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

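    // pshufd replicated 0xff00ff00 above, i.e. 0xff00 in every 16-bit lane;
    // ptest against the OR-accumulated chars sets ZF iff no char in the
    // chunk has its high byte set, i.e. every char fits in one byte.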
    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
    jcc(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only latin1 chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jcc(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    movl(len, result);
    andl(len, 0xfffffff8);    // vector count (in chars)
    andl(result, 0x00000007); // tail count (in chars)
    testl(len, len);
    jccb(Assembler::zero, copy_tail);

    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
    jccb(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg);    // only latin1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);

    bind(copy_tail);
    movl(len, result);
  }
  // compress 1 char per iter
  testl(len, len);
  jccb(Assembler::zero, return_length);
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(result, Address(src, len, Address::times_2));
  testl(result, 0xff00);      // check if Unicode char
  jccb(Assembler::notZero, return_zero);
  movb(Address(dst, len, Address::times_1), result);  // latin1 char; compress to 1 byte
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  // if compression succeeded, return length
  bind(return_length);
  pop(result);
  jmpb(done);

  // if compression failed, return 0
  bind(return_zero);
  xorl(result, result);
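  // discard the length saved by push(len) at entry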
  addptr(rsp, wordSize);

  bind(done);
}

// Inflate byte[] array to char[].
//   ../jdk/src/java.base/share/classes/java/lang/StringLatin1.java
//   @HotSpotIntrinsicCandidate
//   private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
//     for (int i = 0; i < len; i++) {
//       dst[dstOff++] = (char)(src[srcOff++] & 0xff);
//     }
//   }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
  XMMRegister tmp1, Register tmp2) {
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  movl(tmp2, len);
  if ((UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    Register tmp3_aliased = len;

    // if the length of the string is less than 16, handle it the old-fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

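    // AVX3Threshold is constrained to be a power of two (see its flag
    // definition), so this test is zero exactly when len < AVX3Threshold;
    // short arrays skip the 512-bit main loop and use the 256-bit/SSE
    // code below.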
    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // Pre-calculate the tail and vector counts so that the main loop below
    // needs only a single arithmetic operation per iteration
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
    andl(len, -32);       // vector count
    jccb(Assembler::zero, copy_tail);

    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, /*merge*/ false, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << tmp2), where tmp2 is the # of remaining elements to process
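    // Worked example (hypothetical value): tmp2 == 3 gives a mask of
    // 0x00000007, so only the three tail bytes are inflated and stored.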
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(k2, tmp3_aliased);
    evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), k2, tmp1, /*merge*/ true, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    if (UseAVX > 1) {
      andl(tmp2, (16 - 1));
      andl(len, -16);
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007);   // tail count (in chars)
      andl(len, 0xfffffff8);    // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);

      bind(below_threshold);
      bind(copy_new_tail);
      movl(len, tmp2);
      andl(tmp2, 0x00000007);
      andl(len, 0xFFFFFFF8);
      jccb(Assembler::zero, copy_tail);

      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    bind(copy_tail);
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    movdl(tmp1, Address(src, 0));  // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}

#ifdef _LP64
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
  Label done;
  cvttss2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
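  // cvttss2si returns 0x80000000 (the "integer indefinite" value) in all of
  // those cases, so a single compare catches them; a genuine result of
  // exactly -2^31 also takes the stub, which simply reproduces it.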
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
  Label done;
  cvttsd2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  Label done;
  cvttss2siq(dst, src);
  // Same pattern as convert_f2i: compare against the 64-bit integer
  // indefinite value 0x8000000000000000 and fix up in the stub if needed
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::cache_wb(Address line)
{
  // 64-bit CPUs always support clflush
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // prefer clwb (writeback without evict); otherwise prefer clflushopt
  // (potentially parallel writeback with evict); otherwise fall back on
  // clflush (serial writeback with evict)

  if (optimized) {
    if (no_evict) {
      clwb(line);
    } else {
      clflushopt(line);
    }
  } else {
    // no need for fence when using CLFLUSH
    clflush(line);
  }
}

void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // pick the correct implementation
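  // clflush is strongly ordered with respect to other stores and flushes,
  // but clflushopt and clwb are weakly ordered and need an sfence before
  // the caller may assume the writebacks are globally visible.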

  if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb;
    // otherwise no synchronization is needed
    sfence();
  }
}
#endif // _LP64

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}

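// SkipIfEqual compares the byte at flag_addr against value and emits a
// conditional jump over everything generated until the destructor binds
// the label; i.e. the guarded code runs only when *flag_addr != value.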
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  _masm->cmp8(ExternalAddress((address)flag_addr), value);
  _masm->jcc(Assembler::equal, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// 32-bit Windows has its own fast-path implementation
// of get_thread
#if !defined(WIN32) || defined(_LP64)

// This is simply a call to Thread::current()
void MacroAssembler::get_thread(Register thread) {
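  // Thread::current() is an ordinary C++ call, so preserve every register
  // the calling convention allows it to clobber; rax is saved only when it
  // is not the destination, since it carries the Thread* result.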
  if (thread != rax) {
    push(rax);
  }
  LP64_ONLY(push(rdi);)
  LP64_ONLY(push(rsi);)
  push(rdx);
  push(rcx);
#ifdef _LP64
  push(r8);
  push(r9);
  push(r10);
  push(r11);
#endif

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

#ifdef _LP64
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
#endif
  pop(rcx);
  pop(rdx);
  LP64_ONLY(pop(rsi);)
  LP64_ONLY(pop(rdi);)
  if (thread != rax) {
    mov(thread, rax);
    pop(rax);
  }
}

#endif // !WIN32 || _LP64