/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_MACROASSEMBLER_X86_HPP
#define CPU_X86_MACROASSEMBLER_X86_HPP

#include "asm/assembler.hpp"
#include "utilities/macros.hpp"
#include "runtime/rtmLocking.hpp"
#include "runtime/signature.hpp"

class ciValueKlass;

// MacroAssembler extends Assembler by frequently used macros.
//
// Instructions for which a 'better' code sequence exists depending
// on arguments should also go in here.

class MacroAssembler: public Assembler {
  friend class LIR_Assembler;
  friend class Runtime1;      // as_Address()

 public:
  // Support for VM calls
  //
  // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).

  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments        // the number of arguments to pop after the call
  );

 protected:
  // This is the base routine called by the different versions of call_VM. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  //
  // If no java_thread register is specified (noreg) then rdi will be used instead. call_VM_base
  // returns the register which contains the thread upon return. If a thread register has been
  // specified, the return value will correspond to that register. If no last_java_sp is specified
  // (noreg) then rsp will be used instead.
  virtual void call_VM_base(           // returns the register containing the thread upon return
    Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
    Register java_thread,              // the thread if computed before     ; use noreg otherwise
    Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
    address  entry_point,              // the entry point
    int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
    bool     check_exceptions          // whether to check for pending exceptions after return
  );

  void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);

  // helpers for FPU flag access
  // tmp is a temporary register; if none is available, use noreg
  void save_rax   (Register tmp);
  void restore_rax(Register tmp);

 public:
  MacroAssembler(CodeBuffer* code) : Assembler(code) {}

  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  // The implementation is only non-empty for the InterpreterMacroAssembler,
  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  virtual void check_and_handle_popframe(Register java_thread);
  virtual void check_and_handle_earlyret(Register java_thread);

  Address as_Address(AddressLiteral adr);
  Address as_Address(ArrayAddress adr);

  // Support for NULL-checks
  //
  // Generates code that causes a NULL OS exception if the content of reg is NULL.
  // If the accessed location is M[reg + offset] and the offset is known, provide the
  // offset. No explicit code generation is needed if the offset is within a certain
  // range (0 <= offset <= page_size).

  void null_check(Register reg, int offset = -1);
  static bool needs_explicit_null_check(intptr_t offset);
  static bool uses_implicit_null_check(void* address);
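
  // Illustrative sketch only (not the actual implementation): the decision
  // described above amounts to probing M[reg] whenever the later access would
  // miss the protected page. The members used below are real declarations of
  // this class; the shape of the routine is an assumption.
  //
  //   void null_check_sketch(MacroAssembler* masm, Register reg, int offset) {
  //     if (MacroAssembler::needs_explicit_null_check(offset)) {
  //       masm->cmpptr(rax, Address(reg, 0));  // touch M[reg]; faults if reg is NULL
  //     }
  //     // otherwise the later access of M[reg + offset] faults by itself
  //   }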

  // valueKlass queries, kills temp_reg
  void test_klass_is_value(Register klass, Register temp_reg, Label& is_value);
  void test_klass_is_empty_value(Register klass, Register temp_reg, Label& is_empty_value);

  // Get the default value oop for the given ValueKlass
  void get_default_value_oop(Register value_klass, Register temp_reg, Register obj);
  // The empty value oop, for the given ValueKlass ("empty" as in no instance fields)
  // get_default_value_oop with extra assertion for empty value klass
  void get_empty_value_oop(Register value_klass, Register temp_reg, Register obj);

  void test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable);
  void test_field_is_not_flattenable(Register flags, Register temp_reg, Label& notFlattenable);
  void test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened);

  // Check oops array storage properties, i.e. flattened and/or null-free
  void test_flattened_array_oop(Register oop, Register temp_reg, Label& is_flattened_array);
  void test_non_flattened_array_oop(Register oop, Register temp_reg, Label& is_non_flattened_array);
  void test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array);
  void test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array);

  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  void pd_patch_instruction(address branch, address target, const char* file, int line) {
    unsigned char op = branch[0];
    assert(op == 0xE8 /* call */ ||
        op == 0xE9 /* jmp */ ||
        op == 0xEB /* short jmp */ ||
        (op & 0xF0) == 0x70 /* short jcc */ ||
        (op == 0x0F && (branch[1] & 0xF0) == 0x80) /* jcc */ ||
        (op == 0xC7 && branch[1] == 0xF8) /* xbegin */,
        "Invalid opcode at patch point");

    if (op == 0xEB || (op & 0xF0) == 0x70) {
      // short offset operators (jmp and jcc)
      char* disp = (char*) &branch[1];
      int imm8 = target - (address) &disp[1];
      guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset at %s:%d",
                file == NULL ? "<NULL>" : file, line);
      *disp = imm8;
    } else {
      int* disp = (int*) &branch[(op == 0x0F || op == 0xC7) ? 2 : 1];
      int imm32 = target - (address) &disp[1];
      *disp = imm32;
    }
  }
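
  // Worked example for the short-offset case above: for a 0xEB (short jmp)
  // at address b, the displacement byte sits at b + 1 and is relative to the
  // end of the instruction, so patching to `target` stores
  //
  //   imm8 = target - (b + 2)
  //
  // which is what the code computes via &disp[1]. The 32-bit case is the
  // same with a 1- or 2-byte opcode and a 4-byte displacement.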

  // The following 4 methods return the offset of the appropriate move instruction

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
  int load_unsigned_byte(Register dst, Address src);
  int load_unsigned_short(Register dst, Address src);

  // Support for fast byte/short loading with sign extension (depending on particular CPU)
  int load_signed_byte(Register dst, Address src);
  int load_signed_short(Register dst, Address src);

  // Support for sign-extension (hi:lo = extend_sign(lo))
  void extend_sign(Register hi, Register lo);

  // Load and store values by size and signed-ness
  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
  void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);

  // Support for inc/dec with optimal instruction selection depending on value

  void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
  void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }

  void decrementl(Address dst, int value = 1);
  void decrementl(Register reg, int value = 1);

  void decrementq(Register reg, int value = 1);
  void decrementq(Address dst, int value = 1);

  void incrementl(Address dst, int value = 1);
  void incrementl(Register reg, int value = 1);

  void incrementq(Register reg, int value = 1);
  void incrementq(Address dst, int value = 1);
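
  // A minimal sketch of the "optimal instruction selection" mentioned above
  // (assumed shape, not the verbatim implementation): trivial values pick
  // shorter encodings, everything else falls back to a plain add.
  //
  //   void incrementl_sketch(MacroAssembler* masm, Register reg, int value) {
  //     if (value <  0) { masm->decrementl(reg, -value); return; }
  //     if (value == 0) { return; }
  //     if (value == 1 && UseIncDec) masm->incl(reg);
  //     else                         masm->addl(reg, value);
  //   }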

#ifdef COMPILER2
  // special instructions for EVEX
  void setvectmask(Register dst, Register src);
  void restorevectmask();
#endif

  // Support optimal SSE move instructions.
  void movflt(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
    else                       { movss (dst, src); return; }
  }
  void movflt(XMMRegister dst, Address src) { movss(dst, src); }
  void movflt(XMMRegister dst, AddressLiteral src);
  void movflt(Address dst, XMMRegister src) { movss(dst, src); }

  void movdbl(XMMRegister dst, XMMRegister src) {
    if (dst->encoding() == src->encoding()) return;
    if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
    else                       { movsd (dst, src); return; }
  }

  void movdbl(XMMRegister dst, AddressLiteral src);

  void movdbl(XMMRegister dst, Address src) {
    if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
    else                         { movlpd(dst, src); return; }
  }
  void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }

  void incrementl(AddressLiteral dst);
  void incrementl(ArrayAddress dst);

  void incrementq(AddressLiteral dst);

  // Alignment
  void align(int modulus);
  void align(int modulus, int target);

  // A 5 byte nop that is safe for patching (see patch_verified_entry)
  void fat_nop();

  // Stack frame creation/removal
  void enter();
  void leave();

  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
  // The pointer will be loaded into the thread register.
  void get_thread(Register thread);


  // Support for VM calls
  //
  // It is imperative that all calls into the VM are handled via the call_VM macros.
  // They make sure that the stack linkage is setup correctly. call_VM's correspond
  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.


  void call_VM(Register oop_result,
               address entry_point,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  // Overloadings with last_Java_sp
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               int number_of_arguments = 0,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);
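
  // Typical usage sketch; `InterpreterRuntime::some_entry` is a made-up name
  // for illustration, while CAST_FROM_FN_PTR is the usual HotSpot idiom for
  // obtaining an `address` from a C++ function:
  //
  //   // result oop in rax, one register argument, default exception check
  //   call_VM(rax,
  //           CAST_FROM_FN_PTR(address, InterpreterRuntime::some_entry),
  //           rbx);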

  void get_vm_result  (Register oop_result, Register thread);
  void get_vm_result_2(Register metadata_result, Register thread);

  // These always tightly bind to MacroAssembler::call_VM_base
  // bypassing the virtual implementation
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);

  void call_VM_leaf0(address entry_point);
  void call_VM_leaf(address entry_point,
                    int number_of_arguments = 0);
  void call_VM_leaf(address entry_point,
                    Register arg_1);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3);

  // These always tightly bind to MacroAssembler::call_VM_leaf_base
  // bypassing the virtual implementation
  void super_call_VM_leaf(address entry_point);
  void super_call_VM_leaf(address entry_point, Register arg_1);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // last Java Frame (fills frame anchor)
  void set_last_Java_frame(Register thread,
                           Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc);

  // thread in the default location (r15_thread on 64bit)
  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc);

  void reset_last_Java_frame(Register thread, bool clear_fp);

  // thread in the default location (r15_thread on 64bit)
  void reset_last_Java_frame(bool clear_fp);

  // jobjects
  void clear_jweak_tag(Register possibly_jweak);
  void resolve_jobject(Register value, Register thread, Register tmp);

  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
  void c2bool(Register x);

  // C++ bool manipulation

  void movbool(Register dst, Address src);
  void movbool(Address dst, bool boolconst);
  void movbool(Address dst, Register src);
  void testbool(Register dst);

  void resolve_oop_handle(Register result, Register tmp = rscratch2);
  void resolve_weak_handle(Register result, Register tmp);
  void load_mirror(Register mirror, Register method, Register tmp = rscratch2);
  void load_method_holder_cld(Register rresult, Register rmethod);

  void load_method_holder(Register holder, Register method);

  // oop manipulations
  void load_metadata(Register dst, Register src);
  void load_storage_props(Register dst, Register src);
  void load_klass(Register dst, Register src);
  void store_klass(Register dst, Register src);

  void access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                      Register tmp1, Register thread_tmp);
  void access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
                       Register tmp1, Register tmp2, Register tmp3 = noreg);

  void access_value_copy(DecoratorSet decorators, Register src, Register dst, Register value_klass);

  // value type data payload offsets...
  void first_field_offset(Register value_klass, Register offset);
  void data_for_oop(Register oop, Register data, Register value_klass);


  // Resolves obj access. Result is placed in the same register.
  // All other registers are preserved.
  void resolve(DecoratorSet decorators, Register obj);

  void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
                     Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
                              Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void store_heap_oop(Address dst, Register src, Register tmp1 = noreg,
                      Register tmp2 = noreg, Register tmp3 = noreg, DecoratorSet decorators = 0);

  // Used for storing NULL. All other oop constants should be
  // stored using routines that take a jobject.
  void store_heap_oop_null(Address dst);

  void load_prototype_header(Register dst, Register src);

#ifdef _LP64
  void store_klass_gap(Register dst, Register src);

  // This dummy is to prevent a call to store_heap_oop from
  // converting a zero (like NULL) into a Register by giving
  // the compiler two choices it can't resolve

  void store_heap_oop(Address dst, void* dummy);

  void encode_heap_oop(Register r);
  void decode_heap_oop(Register r);
  void encode_heap_oop_not_null(Register r);
  void decode_heap_oop_not_null(Register r);
  void encode_heap_oop_not_null(Register dst, Register src);
  void decode_heap_oop_not_null(Register dst, Register src);

  void set_narrow_oop(Register dst, jobject obj);
  void set_narrow_oop(Address dst, jobject obj);
  void cmp_narrow_oop(Register dst, jobject obj);
  void cmp_narrow_oop(Address dst, jobject obj);

  void encode_klass_not_null(Register r);
  void decode_klass_not_null(Register r);
  void encode_klass_not_null(Register dst, Register src);
  void decode_klass_not_null(Register dst, Register src);
  void set_narrow_klass(Register dst, Klass* k);
  void set_narrow_klass(Address dst, Klass* k);
  void cmp_narrow_klass(Register dst, Klass* k);
  void cmp_narrow_klass(Address dst, Klass* k);

  // Returns the byte size of the instructions generated by decode_klass_not_null()
  // when compressed klass pointers are being used.
  static int instr_size_for_decode_klass_not_null();

  // If the heap base register is used, reinitialize it with the correct value.
  void reinit_heapbase();

  DEBUG_ONLY(void verify_heapbase(const char* msg);)

#endif // _LP64

  // Int division/remainder for Java
  // (as idivl, but checks for special case as described in JVM spec.)
  // returns idivl instruction offset for implicit exception handling
  int corrected_idivl(Register reg);

  // Long division/remainder for Java
  // (as idivq, but checks for special case as described in JVM spec.)
  // returns idivq instruction offset for implicit exception handling
  int corrected_idivq(Register reg);
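
  // The "special case as described in JVM spec" is min_int / -1 (and the
  // analogous min_long / -1), where the hardware idiv would raise #DE on
  // overflow. The required Java semantics, in C terms:
  //
  //   if (dividend == min_jint && divisor == -1) {
  //     quotient  = min_jint;  // overflow wraps around
  //     remainder = 0;
  //   }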

  void int3();

  // Long operation macros for a 32bit cpu
  // Long negation for Java
  void lneg(Register hi, Register lo);

  // Long multiplication for Java
  // (destroys contents of eax, ebx, ecx and edx)
  void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y

  // Long shifts for Java
  // (semantics as described in JVM spec.)
  void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
  void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)

  // Long compare for Java
  // (semantics as described in JVM spec.)
  void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)
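
  // Reference semantics being implemented (JVM spec lcmp), as plain C:
  //
  //   jint lcmp(jlong x, jlong y) {
  //     return x < y ? -1 : (x > y ? 1 : 0);
  //   }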


  // misc

  // Sign extension
  void sign_extend_short(Register reg);
  void sign_extend_byte(Register reg);

  // Division by power of 2, rounding towards 0
  void division_with_shift(Register reg, int shift_value);

  // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
  //
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // tmp is a temporary register; if none is available, use noreg (only matters for non-P6 code)
  void fcmp(Register tmp);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp(Register tmp, int index, bool pop_left, bool pop_right);

  // Floating-point comparison for Java
  // Compares the top-most stack entries on the FPU stack and stores the result in dst.
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // (semantics as described in JVM spec.)
  void fcmp2int(Register dst, bool unordered_is_less);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);
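
  // Reference semantics (JVM spec fcmpl/fcmpg), with unordered_is_less
  // selecting between the two bytecodes:
  //
  //   jint fcmp2int(jfloat x, jfloat y, bool unordered_is_less) {
  //     if (x != x || y != y) return unordered_is_less ? -1 : 1;  // NaN
  //     return x < y ? -1 : (x > y ? 1 : 0);
  //   }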

  // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
  // tmp is a temporary register; if none is available, use noreg
  void fremr(Register tmp);

  // dst = c = a * b + c
  void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
  void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);

  void vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len);
  void vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);
  void vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len);


  // same as fcmp2int, but using SSE2
  void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
  void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);

  // branch to L if FPU flag C2 is set/not set
  // tmp is a temporary register; if none is available, use noreg
  void jC2 (Register tmp, Label& L);
  void jnC2(Register tmp, Label& L);

  // Pop ST (ffree & fincstp combined)
  void fpop();

  // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_float(Address src);

  // Store float value to 'address'. If UseSSE >= 1, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_float(Address dst);

  // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_double(Address src);

  // Store double value to 'address'. If UseSSE >= 2, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_double(Address dst);

  // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
  void push_fTOS();

  // pops double TOS element from CPU stack and pushes on FPU stack
  void pop_fTOS();

  void empty_FPU_stack();

  void push_IU_state();
  void pop_IU_state();

  void push_FPU_state();
  void pop_FPU_state();

  void push_CPU_state();
  void pop_CPU_state();

  // Round up to a multiple of modulus (modulus must be a power of two)
  void round_to(Register reg, int modulus);
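
  // Rounding up to a multiple of a power-of-two modulus is the classic
  // two-instruction sequence; a sketch of what this likely emits:
  //
  //   addptr(reg, modulus - 1);
  //   andptr(reg, -modulus);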

  // Callee saved registers handling
  void push_callee_saved_registers();
  void pop_callee_saved_registers();

  // allocation

  // Object / value buffer allocation...
  // Allocate instance of klass, assumes klass initialized by caller
  // new_obj prefers to be rax
  // Kills t1 and t2, preserves klass, return allocation in new_obj (rsi on LP64)
  void allocate_instance(Register klass, Register new_obj,
                         Register t1, Register t2,
                         bool clear_fields, Label& alloc_failed);

  void eden_allocate(
    Register thread,                   // Current thread
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void tlab_allocate(
    Register thread,                   // Current thread
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Register t2,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);

  // For field "index" within "klass", return value_klass ...
  void get_value_field_klass(Register klass, Register index, Register value_klass);

  // interface method calling
  void lookup_interface_method(Register recv_klass,
                               Register intf_klass,
                               RegisterOrConstant itable_index,
                               Register method_result,
                               Register scan_temp,
                               Label& no_such_interface,
                               bool return_method = true);

  // virtual method calling
  void lookup_virtual_method(Register recv_klass,
                             RegisterOrConstant vtable_index,
                             Register method_result);

  // Test sub_klass against super_klass, with fast and slow paths.

  // The fast path produces a tri-state answer: yes / no / maybe-slow.
  // One of the three labels can be NULL, meaning take the fall-through.
  // If super_check_offset is -1, the value is loaded up from super_klass.
  // No registers are killed, except temp_reg.
  void check_klass_subtype_fast_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     Label* L_slow_path,
                RegisterOrConstant super_check_offset = RegisterOrConstant(-1));

  // The rest of the type check; must be wired to a corresponding fast path.
  // It does not repeat the fast path logic, so don't use it standalone.
  // The temp_reg and temp2_reg can be noreg, if no temps are available.
  // Updates the sub's secondary super cache as necessary.
  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     bool set_cond_codes = false);

  // Simplified, combined version, good for typical uses.
  // Falls through on failure.
  void check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success);
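
  // Usage sketch: the combined check branches to L_success and falls through
  // on failure, so a caller typically looks like (register choices are
  // illustrative only):
  //
  //   Label ok;
  //   check_klass_subtype(rsi, rax, rcx, ok);  // sub, super, temp
  //   // ... failure path, e.g. jump to a slow stub ...
  //   bind(ok);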

  void clinit_barrier(Register klass,
                      Register thread,
                      Label* L_fast_path = NULL,
                      Label* L_slow_path = NULL);

  // method handles (JSR 292)
  Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);

  //----
  void set_word_if_not_zero(Register reg); // sets reg to 1 if not zero, otherwise 0

  // Debugging

  // only if +VerifyOops
  // TODO: Make these macros with file and line like sparc version!
  void verify_oop(Register reg, const char* s = "broken oop");
  void verify_oop_addr(Address addr, const char * s = "broken oop addr");

  // TODO: verify method and klass metadata (compare against vptr?)
  void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
  void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line) {}

#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)

  // only if +VerifyFPU
  void verify_FPU(int stack_depth, const char* s = "illegal FPU state");

  // Verify or restore cpu control state after JNI call
  void restore_cpu_control_state_after_jni();

  // prints msg, dumps registers and stops execution
  void stop(const char* msg);

  // prints msg and continues
  void warn(const char* msg);

  // dumps registers and other state
  void print_state();

  static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
  static void debug64(char* msg, int64_t pc, int64_t regs[]);
  static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
  static void print_state64(int64_t pc, int64_t regs[]);

  void os_breakpoint();

  void untested()                                { stop("untested"); }

  void unimplemented(const char* what = "");

  void should_not_reach_here()                   { stop("should not reach here"); }

  void print_CPU_state();

  // Stack overflow checking
  void bang_stack_with_offset(int offset) {
    // The stack grows down, so the caller passes a positive offset and we
    // bang at a negative displacement from rsp.
    assert(offset > 0, "must bang with positive offset");
    movl(Address(rsp, (-offset)), rax);
  }
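
  // Example: bang_stack_with_offset(os::vm_page_size()) stores rax to
  // [rsp - page_size], so an unmapped guard page there faults immediately.
  // (The argument shown is illustrative; callers pass whatever offset they
  // need to probe.)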

  // Writes to successive stack pages until the given offset is reached, to
  // check for stack overflow + shadow pages.  Also clobbers tmp.
  void bang_stack_size(Register size, Register tmp);

  // Check for reserved stack access in method being exited (for JIT)
  void reserved_stack_check();

  virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr,
                                                Register tmp,
                                                int offset);

  // If thread_reg is != noreg the code assumes the register passed contains
  // the thread (required on 64 bit).
  void safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg);

  void verify_tlab();

  // Biased locking support
  // lock_reg and obj_reg must be loaded up with the appropriate values.
  // swap_reg must be rax, and is killed.
  // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
  // be killed; if not supplied, push/pop will be used internally to
  // allocate a temporary (inefficient, avoid if possible).
  // Optional slow case is for implementations (interpreter and C1) which branch to
  // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
  // Returns offset of first potentially-faulting instruction for null
  // check info (currently consumed only by C1). If
  // swap_reg_contains_mark is true then returns -1 as it is assumed
  // the calling code has already passed any potential faults.
  int biased_locking_enter(Register lock_reg, Register obj_reg,
                           Register swap_reg, Register tmp_reg,
                           bool swap_reg_contains_mark,
                           Label& done, Label* slow_case = NULL,
                           BiasedLockingCounters* counters = NULL);
  void biased_locking_exit(Register obj_reg, Register temp_reg, Label& done);
#ifdef COMPILER2
  // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
  // See full description in macroAssembler_x86.cpp.
  void fast_lock(Register obj, Register box, Register tmp,
                 Register scr, Register cx1, Register cx2,
                 BiasedLockingCounters* counters,
                 RTMLockingCounters* rtm_counters,
                 RTMLockingCounters* stack_rtm_counters,
                 Metadata* method_data,
                 bool use_rtm, bool profile_rtm);
  void fast_unlock(Register obj, Register box, Register tmp, bool use_rtm);
#if INCLUDE_RTM_OPT
  void rtm_counters_update(Register abort_status, Register rtm_counters);
  void branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel);
  void rtm_abort_ratio_calculation(Register tmp, Register rtm_counters_reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data);
  void rtm_profiling(Register abort_status_Reg, Register rtm_counters_Reg,
                     RTMLockingCounters* rtm_counters, Metadata* method_data, bool profile_rtm);
  void rtm_retry_lock_on_abort(Register retry_count, Register abort_status, Label& retryLabel);
  void rtm_retry_lock_on_busy(Register retry_count, Register box, Register tmp, Register scr, Label& retryLabel);
  void rtm_stack_locking(Register obj, Register tmp, Register scr,
                         Register retry_on_abort_count,
                         RTMLockingCounters* stack_rtm_counters,
                         Metadata* method_data, bool profile_rtm,
                         Label& DONE_LABEL, Label& IsInflated);
  void rtm_inflated_locking(Register obj, Register box, Register tmp,
                            Register scr, Register retry_on_busy_count,
                            Register retry_on_abort_count,
                            RTMLockingCounters* rtm_counters,
                            Metadata* method_data, bool profile_rtm,
                            Label& DONE_LABEL);
#endif
#endif

  Condition negate_condition(Condition cond);

  // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
  // operands. In general the names are modified to avoid hiding the instruction in Assembler
  // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
  // here in MacroAssembler. The major exception to this rule is call.

  // Arithmetics


  void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
  void addptr(Address dst, Register src);

  void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
  void addptr(Register dst, int32_t src);
  void addptr(Register dst, Register src);
  void addptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) addptr(dst, (int) src.as_constant());
    else                   addptr(dst,       src.as_register());
  }

  void andptr(Register dst, int32_t src);
  void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }

  void cmp8(AddressLiteral src1, int imm);

  // renamed to drag out the casting of address to int32_t/intptr_t
  void cmp32(Register src1, int32_t imm);

  void cmp32(AddressLiteral src1, int32_t imm);
  // compare reg - mem, or reg - &mem
  void cmp32(Register src1, AddressLiteral src2);

  void cmp32(Register src1, Address src2);

#ifndef _LP64
  void cmpklass(Address dst, Metadata* obj);
  void cmpklass(Register dst, Metadata* obj);
  void cmpoop(Address dst, jobject obj);
  void cmpoop_raw(Address dst, jobject obj);
#endif // _LP64

  void cmpoop(Register src1, Register src2);
  void cmpoop(Register src1, Address src2);
  void cmpoop(Register dst, jobject obj);
  void cmpoop_raw(Register dst, jobject obj);

  // NOTE: src2 must be the lval. This is NOT a mem-mem compare.
  void cmpptr(Address src1, AddressLiteral src2);

  void cmpptr(Register src1, AddressLiteral src2);

  void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  // cmp64 to avoid hiding cmpq
  void cmp64(Register src1, AddressLiteral src);

  void cmpxchgptr(Register reg, Address adr);

  void locked_cmpxchgptr(Register reg, AddressLiteral adr);


  void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
  void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }


  void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }

  void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }

  void shlptr(Register dst, int32_t shift);
  void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }

  void shrptr(Register dst, int32_t shift);
  void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }

  void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
  void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }

  void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }

  void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
  void subptr(Register dst, int32_t src);
  // Force generation of a 4 byte immediate value even if it fits into 8 bits
  void subptr_imm32(Register dst, int32_t src);
  void subptr(Register dst, Register src);
  void subptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) subptr(dst, (int) src.as_constant());
    else                   subptr(dst,       src.as_register());
  }

  void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
  void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }

  void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
  void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }

  void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }



  // Helper functions for statistics gathering.
  // Conditionally (atomically, on MPs) increments the passed counter address, preserving condition codes.
  void cond_inc32(Condition cond, AddressLiteral counter_addr);
  // Unconditional atomic increment.
  void atomic_incl(Address counter_addr);
  void atomic_incl(AddressLiteral counter_addr, Register scr = rscratch1);
#ifdef _LP64
  void atomic_incq(Address counter_addr);
  void atomic_incq(AddressLiteral counter_addr, Register scr = rscratch1);
#endif
  void atomic_incptr(AddressLiteral counter_addr, Register scr = rscratch1) { LP64_ONLY(atomic_incq(counter_addr, scr)) NOT_LP64(atomic_incl(counter_addr, scr)) ; }
  void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }

  void lea(Register dst, AddressLiteral adr);
  void lea(Address dst, AddressLiteral adr);
  void lea(Register dst, Address adr) { Assembler::lea(dst, adr); }

  void leal32(Register dst, Address src) { leal(dst, src); }

  // Import other testl() methods from the parent class or else
  // they will be hidden by the following overriding declaration.
  using Assembler::testl;
  void testl(Register dst, AddressLiteral src);

  void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }

  void testptr(Register src, int32_t imm32) { LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
  void testptr(Register src1, Address src2) { LP64_ONLY(testq(src1, src2)) NOT_LP64(testl(src1, src2)); }
  void testptr(Register src1, Register src2);

  void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
  void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }

  // Calls

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register entry);

  // NOTE: this call transfers to the effective address of entry, NOT to
  // the address contained at entry, because that is more natural
  // for jumps/calls.
  void call(AddressLiteral entry);

  // Emit the CompiledIC call idiom
  void ic_call(address entry, jint method_index = 0);

  // Jumps

  // NOTE: these jumps transfer to the effective address of dst, NOT to
  // the address contained at dst, because that is more natural
  // for jumps/calls.
  void jump(AddressLiteral dst);
  void jump_cc(Condition cc, AddressLiteral dst);

  // 32bit can do a case table jump in one instruction but we no longer allow the base
  // to be installed in the Address class. This jump transfers to the address
  // contained in the location described by entry (not to the address of entry).
  void jump(ArrayAddress entry);

  // Floating

  void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); }
  void andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
  void andpd(XMMRegister dst, XMMRegister src) { Assembler::andpd(dst, src); }

  void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); }
  void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); }
  void andps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);

  void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); }
  void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); }
  void comiss(XMMRegister dst, AddressLiteral src);

  void comisd(XMMRegister dst, XMMRegister src) { Assembler::comisd(dst, src); }
  void comisd(XMMRegister dst, Address src) { Assembler::comisd(dst, src); }
  void comisd(XMMRegister dst, AddressLiteral src);

  void fadd_s(Address src)        { Assembler::fadd_s(src); }
  void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }

  void fldcw(Address src) { Assembler::fldcw(src); }
  void fldcw(AddressLiteral src);

  void fld_s(int index)   { Assembler::fld_s(index); }
  void fld_s(Address src) { Assembler::fld_s(src); }
  void fld_s(AddressLiteral src);

  void fld_d(Address src) { Assembler::fld_d(src); }
  void fld_d(AddressLiteral src);

  void fld_x(Address src) { Assembler::fld_x(src); }
  void fld_x(AddressLiteral src);

  void fmul_s(Address src)        { Assembler::fmul_s(src); }
  void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }

  void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
  void ldmxcsr(AddressLiteral src);

#ifdef _LP64
 private:
  void sha256_AVX2_one_round_compute(
    Register  reg_old_h,
    Register  reg_a,
    Register  reg_b,
    Register  reg_c,
    Register  reg_d,
    Register  reg_e,
    Register  reg_f,
    Register  reg_g,
    Register  reg_h,
    int iter);
  void sha256_AVX2_four_rounds_compute_first(int start);
  void sha256_AVX2_four_rounds_compute_last(int start);
  void sha256_AVX2_one_round_and_sched(
        XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister xmm_2,     /* ymm6 */
        XMMRegister xmm_3,     /* ymm7 */
        Register    reg_a,      /* == eax on 0 iteration, then rotate 8 registers right on each next iteration */
        Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
        Register    reg_c,      /* edi */
        Register    reg_d,      /* esi */
        Register    reg_e,      /* r8d */
        Register    reg_f,      /* r9d */
        Register    reg_g,      /* r10d */
        Register    reg_h,      /* r11d */
        int iter);

  void addm(int disp, Register r1, Register r2);
  void gfmul(XMMRegister tmp0, XMMRegister t);
  void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
                     XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void generateHtbl_one_block(Register htbl);
  void generateHtbl_eight_blocks(Register htbl);
 public:
  void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp,
                   bool multi_block, XMMRegister shuf_mask);
  void avx_ghash(Register state, Register htbl, Register data, Register blocks);
#endif

#ifdef _LP64
 private:
  void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
                                     Register e, Register f, Register g, Register h, int iteration);

  void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                                          Register a, Register b, Register c, Register d, Register e, Register f,
                                          Register g, Register h, int iteration);

  void addmq(int disp, Register r1, Register r2);
 public:
  void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
                   XMMRegister shuf_mask);
 private:
  void roundEnc(XMMRegister key, int rnum);
  void lastroundEnc(XMMRegister key, int rnum);
  void roundDec(XMMRegister key, int rnum);
  void lastroundDec(XMMRegister key, int rnum);
  void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);

 public:
  void aesecb_encrypt(Register source_addr, Register dest_addr, Register key, Register len);
  void aesecb_decrypt(Register source_addr, Register dest_addr, Register key, Register len);
  void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
                      Register len_reg, Register used, Register used_addr, Register saved_encCounter_start);

#endif

  void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
                 XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
                 Register buf, Register state, Register ofs, Register limit, Register rsp,
                 bool multi_block);

#ifdef _LP64
  void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp,
                   bool multi_block, XMMRegister shuf_mask);
#else
  void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp,
                   bool multi_block);
#endif

  void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp);

#ifdef _LP64
  void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1, Register tmp2);

  void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                  XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                  Register rax, Register rcx, Register rdx, Register r11);

  void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
                Register rdx, Register tmp1, Register tmp2, Register tmp3, Register tmp4);

  void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rbx, Register rcx, Register rdx, Register tmp1, Register tmp2,
                Register tmp3, Register tmp4);

  void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1,
                Register tmp2, Register tmp3, Register tmp4);
  void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1,
                Register tmp2, Register tmp3, Register tmp4);
#else
  void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1);

  void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                  XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                  Register rax, Register rcx, Register rdx, Register tmp);

  void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
                Register rdx, Register tmp);

  void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rbx, Register rdx);

  void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp);

  void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
                        Register edx, Register ebx, Register esi, Register edi,
                        Register ebp, Register esp);

  void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
                         Register esi, Register edi, Register ebp, Register esp);

  void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
                        Register edx, Register ebx, Register esi, Register edi,
                        Register ebp, Register esp);

  void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp);
#endif

  void increase_precision();
  void restore_precision();

1132 private:
1133 
1134   // these are private because users should be doing movflt/movdbl
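  // For example, prefer (a sketch; 'dst' and 'src' are placeholder operands):
  //   __ movflt(dst, src);   // picks the preferred single-float move for the CPU
  //   __ movdbl(dst, src);   // likewise for doubles
  // over calling the raw movss/movsd/movlpd forms directly.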

  void movss(XMMRegister dst, XMMRegister src) { Assembler::movss(dst, src); }
  void movss(Address dst, XMMRegister src)     { Assembler::movss(dst, src); }
  void movss(XMMRegister dst, Address src)     { Assembler::movss(dst, src); }
  void movss(XMMRegister dst, AddressLiteral src);

  void movlpd(XMMRegister dst, Address src)    { Assembler::movlpd(dst, src); }
  void movlpd(XMMRegister dst, AddressLiteral src);

public:

  void addsd(XMMRegister dst, XMMRegister src)    { Assembler::addsd(dst, src); }
  void addsd(XMMRegister dst, Address src)        { Assembler::addsd(dst, src); }
  void addsd(XMMRegister dst, AddressLiteral src);

  void addss(XMMRegister dst, XMMRegister src)    { Assembler::addss(dst, src); }
  void addss(XMMRegister dst, Address src)        { Assembler::addss(dst, src); }
  void addss(XMMRegister dst, AddressLiteral src);

  void addpd(XMMRegister dst, XMMRegister src)    { Assembler::addpd(dst, src); }
  void addpd(XMMRegister dst, Address src)        { Assembler::addpd(dst, src); }
  void addpd(XMMRegister dst, AddressLiteral src);

  void divsd(XMMRegister dst, XMMRegister src)    { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, Address src)        { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, AddressLiteral src);

  void divss(XMMRegister dst, XMMRegister src)    { Assembler::divss(dst, src); }
  void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
  void divss(XMMRegister dst, AddressLiteral src);

  // Move Unaligned Double Quadword
  void movdqu(Address     dst, XMMRegister src);
  void movdqu(XMMRegister dst, Address src);
  void movdqu(XMMRegister dst, XMMRegister src);
  void movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg = rscratch1);
  // AVX Unaligned forms
  void vmovdqu(Address     dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, Address src);
  void vmovdqu(XMMRegister dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);
  void evmovdquq(XMMRegister dst, Address src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
  void evmovdquq(Address dst, XMMRegister src, int vector_len) { Assembler::evmovdquq(dst, src, vector_len); }
  void evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch);

  // Move Aligned Double Quadword
  void movdqa(XMMRegister dst, Address src)       { Assembler::movdqa(dst, src); }
  void movdqa(XMMRegister dst, XMMRegister src)   { Assembler::movdqa(dst, src); }
  void movdqa(XMMRegister dst, AddressLiteral src);

  void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
  void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
  void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
  void movsd(XMMRegister dst, AddressLiteral src);

  void mulpd(XMMRegister dst, XMMRegister src)    { Assembler::mulpd(dst, src); }
  void mulpd(XMMRegister dst, Address src)        { Assembler::mulpd(dst, src); }
  void mulpd(XMMRegister dst, AddressLiteral src);

  void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
  void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
  void mulsd(XMMRegister dst, AddressLiteral src);

  void mulss(XMMRegister dst, XMMRegister src)    { Assembler::mulss(dst, src); }
  void mulss(XMMRegister dst, Address src)        { Assembler::mulss(dst, src); }
  void mulss(XMMRegister dst, AddressLiteral src);

  // Carry-Less Multiplication Quadword
  void pclmulldq(XMMRegister dst, XMMRegister src) {
    // 0x00 - multiply lower 64 bits [0:63]
    Assembler::pclmulqdq(dst, src, 0x00);
  }
  void pclmulhdq(XMMRegister dst, XMMRegister src) {
    // 0x11 - multiply upper 64 bits [64:127]
    Assembler::pclmulqdq(dst, src, 0x11);
  }
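  // (The pclmulqdq imm8 selects the source quadwords: bit 0 picks the low or
  //  high quadword of the first operand, bit 4 does the same for the second
  //  operand - hence 0x00 = low*low and 0x11 = high*high above.)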

  void pcmpeqb(XMMRegister dst, XMMRegister src);
  void pcmpeqw(XMMRegister dst, XMMRegister src);

  void pcmpestri(XMMRegister dst, Address src, int imm8);
  void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);

  void pmovzxbw(XMMRegister dst, XMMRegister src);
  void pmovzxbw(XMMRegister dst, Address src);

  void pmovmskb(Register dst, XMMRegister src);

  void ptest(XMMRegister dst, XMMRegister src);

  void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
  void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
  void sqrtsd(XMMRegister dst, AddressLiteral src);

  void roundsd(XMMRegister dst, XMMRegister src, int32_t rmode)    { Assembler::roundsd(dst, src, rmode); }
  void roundsd(XMMRegister dst, Address src, int32_t rmode)        { Assembler::roundsd(dst, src, rmode); }
  void roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg);

  void sqrtss(XMMRegister dst, XMMRegister src)    { Assembler::sqrtss(dst, src); }
  void sqrtss(XMMRegister dst, Address src)        { Assembler::sqrtss(dst, src); }
  void sqrtss(XMMRegister dst, AddressLiteral src);

  void subsd(XMMRegister dst, XMMRegister src)    { Assembler::subsd(dst, src); }
  void subsd(XMMRegister dst, Address src)        { Assembler::subsd(dst, src); }
  void subsd(XMMRegister dst, AddressLiteral src);

  void subss(XMMRegister dst, XMMRegister src)    { Assembler::subss(dst, src); }
  void subss(XMMRegister dst, Address src)        { Assembler::subss(dst, src); }
  void subss(XMMRegister dst, AddressLiteral src);

  void ucomiss(XMMRegister dst, XMMRegister src) { Assembler::ucomiss(dst, src); }
  void ucomiss(XMMRegister dst, Address src)     { Assembler::ucomiss(dst, src); }
  void ucomiss(XMMRegister dst, AddressLiteral src);

  void ucomisd(XMMRegister dst, XMMRegister src) { Assembler::ucomisd(dst, src); }
  void ucomisd(XMMRegister dst, Address src)     { Assembler::ucomisd(dst, src); }
  void ucomisd(XMMRegister dst, AddressLiteral src);

  // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
  void xorpd(XMMRegister dst, XMMRegister src);
  void xorpd(XMMRegister dst, Address src)     { Assembler::xorpd(dst, src); }
  void xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);

  // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
  void xorps(XMMRegister dst, XMMRegister src);
  void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
  void xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);

  // Shuffle Bytes
  void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); }
  void pshufb(XMMRegister dst, Address src)     { Assembler::pshufb(dst, src); }
  void pshufb(XMMRegister dst, AddressLiteral src);

  // AVX 3-operand instructions
  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
  void vaddsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddsd(dst, nds, src); }
  void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddss(dst, nds, src); }
  void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
  void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);
  void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);

  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpaddd(dst, nds, src, vector_len); }
  void vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch);

  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
  void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  void vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
  void vpbroadcastw(XMMRegister dst, Address src, int vector_len) { Assembler::vpbroadcastw(dst, src, vector_len); }

  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
  void vpmovzxbw(XMMRegister dst, XMMRegister src, int vector_len) { Assembler::vpmovzxbw(dst, src, vector_len); }

  void vpmovmskb(Register dst, XMMRegister src);

  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
  void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
  void evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
  void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
  void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vptest(XMMRegister dst, XMMRegister src);

  void punpcklbw(XMMRegister dst, XMMRegister src);
  void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }

  void pshufd(XMMRegister dst, Address src, int mode);
  void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }

  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
  void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }

  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
  void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandpd(dst, nds, src, vector_len); }
  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandps(dst, nds, src, vector_len); }
  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
  void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
  void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivss(dst, nds, src); }
  void vdivss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivss(dst, nds, src); }
  void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulsd(dst, nds, src); }
  void vmulsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulsd(dst, nds, src); }
  void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulss(dst, nds, src); }
  void vmulss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulss(dst, nds, src); }
  void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubsd(dst, nds, src); }
  void vsubsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubsd(dst, nds, src); }
  void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubss(dst, nds, src); }
  void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
  void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
  void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  // AVX Vector instructions

  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
  void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
    if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor requires AVX2
      Assembler::vpxor(dst, nds, src, vector_len);
    else
      Assembler::vxorpd(dst, nds, src, vector_len);
  }
  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
    if (UseAVX > 1 || (vector_len < 1)) // 256-bit vpxor requires AVX2
      Assembler::vpxor(dst, nds, src, vector_len);
    else
      Assembler::vxorpd(dst, nds, src, vector_len);
  }
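  // Note: XOR is a pure bitwise operation, so on AVX1 (which lacks 256-bit
  // integer instructions) the FP-domain vxorpd is a bit-identical substitute,
  // at worst costing a domain-crossing penalty on some CPUs.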
  void vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg = rscratch1);

  // Simple version for AVX2 256-bit vectors
  void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, Assembler::AVX_256bit); }
  void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, Assembler::AVX_256bit); }

  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vinserti32x4(dst, dst, src, imm8);
    } else if (UseAVX > 1) {
      // vinserti128 is available only in AVX2
      Assembler::vinserti128(dst, nds, src, imm8);
    } else {
      Assembler::vinsertf128(dst, nds, src, imm8);
    }
  }

  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vinserti32x4(dst, dst, src, imm8);
    } else if (UseAVX > 1) {
      // vinserti128 is available only in AVX2
      Assembler::vinserti128(dst, nds, src, imm8);
    } else {
      Assembler::vinsertf128(dst, nds, src, imm8);
    }
  }

  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vextracti32x4(dst, src, imm8);
    } else if (UseAVX > 1) {
      // vextracti128 is available only in AVX2
      Assembler::vextracti128(dst, src, imm8);
    } else {
      Assembler::vextractf128(dst, src, imm8);
    }
  }

  void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vextracti32x4(dst, src, imm8);
    } else if (UseAVX > 1) {
      // vextracti128 is available only in AVX2
      Assembler::vextracti128(dst, src, imm8);
    } else {
      Assembler::vextractf128(dst, src, imm8);
    }
  }
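  // (The UseAVX > 2 paths use the EVEX-encoded 32x4 forms; presumably so these
  //  helpers also work with the AVX-512 extended registers xmm16-xmm31, which
  //  the VEX-encoded 128-bit forms cannot address.)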

  // 128-bit copy to/from the high 128 bits of 256-bit (YMM) vector registers
  void vinserti128_high(XMMRegister dst, XMMRegister src) {
    vinserti128(dst, dst, src, 1);
  }
  void vinserti128_high(XMMRegister dst, Address src) {
    vinserti128(dst, dst, src, 1);
  }
  void vextracti128_high(XMMRegister dst, XMMRegister src) {
    vextracti128(dst, src, 1);
  }
  void vextracti128_high(Address dst, XMMRegister src) {
    vextracti128(dst, src, 1);
  }

  void vinsertf128_high(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 1);
    } else {
      Assembler::vinsertf128(dst, dst, src, 1);
    }
  }

  void vinsertf128_high(XMMRegister dst, Address src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 1);
    } else {
      Assembler::vinsertf128(dst, dst, src, 1);
    }
  }

  void vextractf128_high(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 1);
    } else {
      Assembler::vextractf128(dst, src, 1);
    }
  }

  void vextractf128_high(Address dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 1);
    } else {
      Assembler::vextractf128(dst, src, 1);
    }
  }

  // 256-bit copy to/from the high 256 bits of 512-bit (ZMM) vector registers
  void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vinserti64x4(dst, dst, src, 1);
  }
  void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vinsertf64x4(dst, dst, src, 1);
  }
  void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vextracti64x4(dst, src, 1);
  }
  void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 1);
  }
  void vextractf64x4_high(Address dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 1);
  }
  void vinsertf64x4_high(XMMRegister dst, Address src) {
    Assembler::vinsertf64x4(dst, dst, src, 1);
  }

  // 128-bit copy to/from the low 128 bits of 256-bit (YMM) vector registers
  void vinserti128_low(XMMRegister dst, XMMRegister src) {
    vinserti128(dst, dst, src, 0);
  }
  void vinserti128_low(XMMRegister dst, Address src) {
    vinserti128(dst, dst, src, 0);
  }
  void vextracti128_low(XMMRegister dst, XMMRegister src) {
    vextracti128(dst, src, 0);
  }
  void vextracti128_low(Address dst, XMMRegister src) {
    vextracti128(dst, src, 0);
  }

  void vinsertf128_low(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 0);
    } else {
      Assembler::vinsertf128(dst, dst, src, 0);
    }
  }

  void vinsertf128_low(XMMRegister dst, Address src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 0);
    } else {
      Assembler::vinsertf128(dst, dst, src, 0);
    }
  }

  void vextractf128_low(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 0);
    } else {
      Assembler::vextractf128(dst, src, 0);
    }
  }

  void vextractf128_low(Address dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 0);
    } else {
      Assembler::vextractf128(dst, src, 0);
    }
  }

  // 256-bit copy to/from the low 256 bits of 512-bit (ZMM) vector registers
  void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vinserti64x4(dst, dst, src, 0);
  }
  void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vinsertf64x4(dst, dst, src, 0);
  }
  void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vextracti64x4(dst, src, 0);
  }
  void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 0);
  }
  void vextractf64x4_low(Address dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 0);
  }
  void vinsertf64x4_low(XMMRegister dst, Address src) {
    Assembler::vinsertf64x4(dst, dst, src, 0);
  }

  // Carry-Less Multiplication Quadword
  void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x00 - multiply lower 64 bits [0:63]
    Assembler::vpclmulqdq(dst, nds, src, 0x00);
  }
  void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x11 - multiply upper 64 bits [64:127]
    Assembler::vpclmulqdq(dst, nds, src, 0x11);
  }
  void vpclmullqhqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x10 - multiply nds[0:63] and src[64:127]
    Assembler::vpclmulqdq(dst, nds, src, 0x10);
  }
  void vpclmulhqlqdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x01 - multiply nds[64:127] and src[0:63]
    Assembler::vpclmulqdq(dst, nds, src, 0x01);
  }

  void evpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
    // 0x00 - multiply lower 64 bits [0:63]
    Assembler::evpclmulqdq(dst, nds, src, 0x00, vector_len);
  }
  void evpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
    // 0x11 - multiply upper 64 bits [64:127]
    Assembler::evpclmulqdq(dst, nds, src, 0x11, vector_len);
  }

  // Data

  void cmov32( Condition cc, Register dst, Address  src);
  void cmov32( Condition cc, Register dst, Register src);

  void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }

  void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
  void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
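  // (LP64_ONLY(code) expands to 'code' on 64-bit builds and to nothing on
  //  32-bit builds; NOT_LP64 is the complement. So cmovptr emits cmovq or
  //  cmov32 to match the pointer width of the build.)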

  void movoop(Register dst, jobject obj);
  void movoop(Address dst, jobject obj);

  void mov_metadata(Register dst, Metadata* obj);
  void mov_metadata(Address dst, Metadata* obj);

  void movptr(ArrayAddress dst, Register src);
  // can this do an lea?
  void movptr(Register dst, ArrayAddress src);

  void movptr(Register dst, Address src);

#ifdef _LP64
  void movptr(Register dst, AddressLiteral src, Register scratch = rscratch1);
#else
  void movptr(Register dst, AddressLiteral src, Register scratch = noreg); // Scratch reg is ignored in 32-bit
#endif

  void movptr(Register dst, intptr_t src);
  void movptr(Register dst, Register src);
  void movptr(Address dst, intptr_t src);

  void movptr(Address dst, Register src);

  void movptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) movptr(dst, src.as_constant());
    else                   movptr(dst, src.as_register());
  }

#ifdef _LP64
  // Generally the next two are only used for moving NULL, although there are
  // situations in initializing the mark word where they could be used. They
  // are dangerous.

  // They only exist on LP64, where int32_t and intptr_t are distinct types;
  // on 32-bit they would be ambiguous with the intptr_t overloads above.

  void movptr(Address dst, int32_t imm32);
  void movptr(Register dst, int32_t imm32);
#endif // _LP64

  // to avoid hiding movl
  void mov32(AddressLiteral dst, Register src);
  void mov32(Register dst, AddressLiteral src);

  // to avoid hiding movb
  void movbyte(ArrayAddress dst, int src);

  // Import other mov() methods from the parent class or else
  // they will be hidden by the following overriding declaration.
  using Assembler::movdl;
  using Assembler::movq;
  void movdl(XMMRegister dst, AddressLiteral src);
  void movq(XMMRegister dst, AddressLiteral src);

  // Can push value or effective address
  void pushptr(AddressLiteral src);

  void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
  void popptr(Address src) { LP64_ONLY(popq(src)) NOT_LP64(popl(src)); }

  void pushoop(jobject obj);
  void pushklass(Metadata* obj);

  // sign-extend a 32-bit ('l') value to a ptr-sized element as needed
  void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
  void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
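  // (On 32-bit builds int and pointer have the same width, so no sign
  //  extension is needed and a plain movl suffices.)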

#ifdef COMPILER2
  // Generic instruction support for C2 code generation from .ad files
  void vabsnegd(int opcode, XMMRegister dst, Register scr);
  void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);
  void vabsnegf(int opcode, XMMRegister dst, Register scr);
  void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr);
  void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len);
  void vextendbw(bool sign, XMMRegister dst, XMMRegister src);
  void vshiftd(int opcode, XMMRegister dst, XMMRegister src);
  void vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister src);
  void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister src);
  void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
#endif

  // C2 compiled method's prolog code.
  void verified_entry(Compile* C, int sp_inc = 0);

  enum RegState {
    reg_readonly,
    reg_writable,
    reg_written
  };

  int store_value_type_fields_to_buf(ciValueKlass* vk, bool from_interpreter = true);

  // Unpack all value type arguments passed as oops
  void unpack_value_args(Compile* C, bool receiver_only);
  bool move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off, int extra_stack_offset);
  bool unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to, int& to_index,
                           RegState reg_state[], int ret_off, int extra_stack_offset);
  bool pack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                         VMReg to, VMRegPair* regs_from, int regs_from_count, int& from_index, RegState reg_state[],
                         int ret_off, int extra_stack_offset);
  void restore_stack(Compile* C);

  int shuffle_value_args(bool is_packing, bool receiver_only, int extra_stack_offset,
                         BasicType* sig_bt, const GrowableArray<SigEntry>* sig_cc,
                         int args_passed, int args_on_stack, VMRegPair* regs,
                         int args_passed_to, int args_on_stack_to, VMRegPair* regs_to);
  bool shuffle_value_args_spill(bool is_packing, const GrowableArray<SigEntry>* sig_cc, int sig_cc_index,
                                VMRegPair* regs_from, int from_index, int regs_from_count,
                                RegState* reg_state, int sp_inc, int extra_stack_offset);
  VMReg spill_reg_for(VMReg reg);

  // clear memory of size 'cnt' qwords, starting at 'base';
  // if 'is_large' is set, do not try to produce a short loop
  void clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp, bool is_large, bool word_copy_only);

  // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
  void xmm_clear_mem(Register base, Register cnt, Register val, XMMRegister xtmp);

#ifdef COMPILER2
  void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);

  // IndexOf strings.
  // Small strings are loaded through the stack if they cross a page boundary.
  void string_indexof(Register str1, Register str2,
                      Register cnt1, Register cnt2,
                      int int_cnt2,  Register result,
                      XMMRegister vec, Register tmp,
                      int ae);

  // IndexOf for constant substrings with size >= 8 elements
  // which don't need to be loaded through the stack.
  void string_indexofC8(Register str1, Register str2,
                        Register cnt1, Register cnt2,
                        int int_cnt2,  Register result,
                        XMMRegister vec, Register tmp,
                        int ae);

  // Smallest code: no need to load through the stack;
  // just check the string tail.

  // helper function for string_compare
  void load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                          Address::ScaleFactor scale, Address::ScaleFactor scale1,
                          Address::ScaleFactor scale2, Register index, int ae);

  // Compare strings.
  void string_compare(Register str1, Register str2,
                      Register cnt1, Register cnt2, Register result,
                      XMMRegister vec1, int ae);

  // Search for a non-ASCII character (negative byte value) in a byte array;
  // return true if one is found, false otherwise.
  void has_negatives(Register ary1, Register len,
                     Register result, Register tmp1,
                     XMMRegister vec1, XMMRegister vec2);

  // Compare char[] or byte[] arrays.
  void arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                     Register limit, Register result, Register chr,
                     XMMRegister vec1, XMMRegister vec2, bool is_char);

#endif

  // Fill primitive arrays
  void generate_fill(BasicType t, bool aligned,
                     Register to, Register value, Register count,
                     Register rtmp, XMMRegister xtmp);

  void encode_iso_array(Register src, Register dst, Register len,
                        XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
                        XMMRegister tmp4, Register tmp5, Register result);

#ifdef _LP64
  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                             Register y, Register y_idx, Register z,
                             Register carry, Register product,
                             Register idx, Register kdx);
  void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
                              Register yz_idx, Register idx,
                              Register carry, Register product, int offset);
  void multiply_128_x_128_bmi2_loop(Register y, Register z,
                                    Register carry, Register carry2,
                                    Register idx, Register jdx,
                                    Register yz_idx1, Register yz_idx2,
                                    Register tmp, Register tmp3, Register tmp4);
  void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                               Register yz_idx, Register idx, Register jdx,
                               Register carry, Register product,
                               Register carry2);
  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
  void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
                     Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
  void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
                            Register tmp2);
  void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
                       Register rdxReg, Register raxReg);
  void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
  void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
                   Register tmp3, Register tmp4);
  void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
                     Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);

  void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
                             Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
                             Register raxReg);
  void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
               Register raxReg);
  void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
                           Register result, Register tmp1, Register tmp2,
                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
#endif

  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
  void update_byte_crc32(Register crc, Register val, Register table);
  void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);

  // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic.
  // Note on a naming convention:
  // Prefix w = register only used on a Westmere+ architecture
  // Prefix n = register only used on a Nehalem architecture
#ifdef _LP64
  void crc32c_ipl_alg4(Register in_out, uint32_t n,
                       Register tmp1, Register tmp2, Register tmp3);
#else
  void crc32c_ipl_alg4(Register in_out, uint32_t n,
                       Register tmp1, Register tmp2, Register tmp3,
                       XMMRegister xtmp1, XMMRegister xtmp2);
#endif
  void crc32c_pclmulqdq(XMMRegister w_xtmp1,
                        Register in_out,
                        uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                        XMMRegister w_xtmp2,
                        Register tmp1,
                        Register n_tmp2, Register n_tmp3);
  void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                       Register tmp1, Register tmp2,
                       Register n_tmp3);
  void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                         Register in_out1, Register in_out2, Register in_out3,
                         Register tmp1, Register tmp2, Register tmp3,
                         XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                         Register tmp4, Register tmp5,
                         Register n_tmp6);
  void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                            Register tmp1, Register tmp2, Register tmp3,
                            Register tmp4, Register tmp5, Register tmp6,
                            XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                            bool is_pclmulqdq_supported);
  // Fold 128-bit data chunk
  void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
  void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
  // Fold 8-bit data
  void fold_8bit_crc32(Register crc, Register table, Register tmp);
  void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
  void fold_128bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);

  // Compress char[] array to byte[].
  void char_array_compress(Register src, Register dst, Register len,
                           XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
                           XMMRegister tmp4, Register tmp5, Register result);

  // Inflate byte[] array to char[].
  void byte_array_inflate(Register src, Register dst, Register len,
                          XMMRegister tmp1, Register tmp2);

#ifdef _LP64
  void cache_wb(Address line);
  void cache_wbsync(bool is_pre);
#endif // _LP64

  #include "asm/macroAssembler_common.hpp"

};

/**
 * class SkipIfEqual:
 *
 * Instantiating this class will result in assembly code being output that will
 * jump around any code emitted between the creation of the instance and its
 * automatic destruction at the end of a scope block, depending on the value of
 * the flag passed to the constructor, which will be checked at run-time.
 */
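//
// A minimal usage sketch (the flag name is hypothetical):
//
//   {
//     SkipIfEqual skip(_masm, &SomeBoolFlag, true);
//     // ... code emitted here is skipped at run-time
//     //     whenever SomeBoolFlag == true ...
//   }  // the destructor binds the label that the skip jumps to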
class SkipIfEqual {
 private:
  MacroAssembler* _masm;
  Label _label;

 public:
  SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value);
  ~SkipIfEqual();
};

#endif // CPU_X86_MACROASSEMBLER_X86_HPP