1 //
   2 // Copyright 1997-2009 Sun Microsystems, Inc.  All Rights Reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  20 // CA 95054 USA or visit www.sun.com if you need additional information or
  21 // have any questions.
  22 //
  23 //
  24 
  25 // X86 Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
// architecture.
  31 
register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

// General Registers
// Previously set EBX, ESI, and EDI as save-on-entry for java code
// Turn off SOE in java-code due to frequent use of uncommon-traps.
// Now that allocator is better, turn on ESI and EDI as SOE registers.

reg_def EBX(SOC, SOE, Op_RegI, 3, rbx->as_VMReg());
reg_def ECX(SOC, SOC, Op_RegI, 1, rcx->as_VMReg());
reg_def ESI(SOC, SOE, Op_RegI, 6, rsi->as_VMReg());
reg_def EDI(SOC, SOE, Op_RegI, 7, rdi->as_VMReg());
// now that adapter frames are gone EBP is always saved and restored by the prolog/epilog code
reg_def EBP(NS, SOE, Op_RegI, 5, rbp->as_VMReg());
reg_def EDX(SOC, SOC, Op_RegI, 2, rdx->as_VMReg());
reg_def EAX(SOC, SOC, Op_RegI, 0, rax->as_VMReg());
reg_def ESP( NS,  NS, Op_RegI, 4, rsp->as_VMReg());

// Special Registers
reg_def EFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());

// Float registers.  We treat TOS/FPR0 special.  It is invisible to the
// allocator, and only shows up in the encodings.
reg_def FPR0L( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
reg_def FPR0H( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
// Ok so here's the trick FPR1 is really st(0) except in the midst
// of emission of assembly for a MachNode. During the emission the fpu stack
// is pushed making FPR1 == st(1) temporarily. However at any safepoint
// the stack will not have this element so FPR1 == st(0) from the
// oopMap viewpoint. This same weirdness with numbering causes
// instruction encoding to have to play games with the register
// encode to correct for this 0/1 issue. See MachSpillCopyNode::implementation
// where it does flt->flt moves to see an example
//
reg_def FPR1L( SOC, SOC, Op_RegF, 1, as_FloatRegister(0)->as_VMReg());
reg_def FPR1H( SOC, SOC, Op_RegF, 1, as_FloatRegister(0)->as_VMReg()->next());
reg_def FPR2L( SOC, SOC, Op_RegF, 2, as_FloatRegister(1)->as_VMReg());
reg_def FPR2H( SOC, SOC, Op_RegF, 2, as_FloatRegister(1)->as_VMReg()->next());
reg_def FPR3L( SOC, SOC, Op_RegF, 3, as_FloatRegister(2)->as_VMReg());
reg_def FPR3H( SOC, SOC, Op_RegF, 3, as_FloatRegister(2)->as_VMReg()->next());
reg_def FPR4L( SOC, SOC, Op_RegF, 4, as_FloatRegister(3)->as_VMReg());
reg_def FPR4H( SOC, SOC, Op_RegF, 4, as_FloatRegister(3)->as_VMReg()->next());
reg_def FPR5L( SOC, SOC, Op_RegF, 5, as_FloatRegister(4)->as_VMReg());
reg_def FPR5H( SOC, SOC, Op_RegF, 5, as_FloatRegister(4)->as_VMReg()->next());
reg_def FPR6L( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg());
reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next());
reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());

// XMM registers.  128-bit registers of 4 words each, labeled a-d.
// Word a in each register holds a Float, words ab hold a Double.
// We currently do not use the SIMD capabilities, so registers cd
// are unused at the moment.
reg_def XMM0a( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
reg_def XMM1a( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
reg_def XMM2a( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
reg_def XMM3a( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
reg_def XMM4a( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
reg_def XMM5a( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
reg_def XMM6a( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
reg_def XMM7a( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());

// Specify priority of register selection within phases of register
// allocation.  Highest priority is first.  A useful heuristic is to
// give registers a low priority when they are required by machine
// instructions, like EAX and EDX.  Registers which are used as
// pairs must fall on an even boundary (witness the FPR#L's in this list).
// For the Intel integer registers, the equivalent Long pairs are
// EDX:EAX, EBX:ECX, and EDI:EBP.
alloc_class chunk0( ECX,   EBX,   EBP,   EDI,   EAX,   EDX,   ESI, ESP,
                    FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H,
                    FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
                    FPR6L, FPR6H, FPR7L, FPR7H );

alloc_class chunk1( XMM0a, XMM0b,
                    XMM1a, XMM1b,
                    XMM2a, XMM2b,
                    XMM3a, XMM3b,
                    XMM4a, XMM4b,
                    XMM5a, XMM5b,
                    XMM6a, XMM6b,
                    XMM7a, XMM7b, EFLAGS);


//----------Architecture Description Register Classes--------------------------
// Several register classes are automatically defined based upon information in
// this architecture description.
// 1) reg_class inline_cache_reg           ( /* as def'd in frame section */ )
// 2) reg_class compiler_method_oop_reg    ( /* as def'd in frame section */ )
// 3) reg_class interpreter_method_oop_reg ( /* as def'd in frame section */ )
// 4) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
//
// Class for all registers
reg_class any_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX, ESP);
// Class for general registers
reg_class e_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX);
// Class for general registers which may be used for implicit null checks on win95
// Also safe for use by tailjump. We don't want to allocate in rbp,
reg_class e_reg_no_rbp(EAX, EDX, EDI, ESI, ECX, EBX);
// Class of "X" registers
reg_class x_reg(EBX, ECX, EDX, EAX);
// Class of registers that can appear in an address with no offset.
// EBP and ESP require an extra instruction byte for zero offset.
// Used in fast-unlock
reg_class p_reg(EDX, EDI, ESI, EBX);
// Class for general registers not including ECX
reg_class ncx_reg(EAX, EDX, EBP, EDI, ESI, EBX);
// Class for general registers not including EAX
reg_class nax_reg(EDX, EDI, ESI, ECX, EBX);
// Class for general registers not including EAX or EBX.
reg_class nabx_reg(EDX, EDI, ESI, ECX, EBP);
// Class of EAX (for multiply and divide operations)
reg_class eax_reg(EAX);
// Class of EBX (for atomic add)
reg_class ebx_reg(EBX);
// Class of ECX (for shift and JCXZ operations and cmpLTMask)
reg_class ecx_reg(ECX);
// Class of EDX (for multiply and divide operations)
reg_class edx_reg(EDX);
// Class of EDI (for synchronization)
reg_class edi_reg(EDI);
// Class of ESI (for synchronization)
reg_class esi_reg(ESI);
// Singleton class for interpreter's stack pointer
reg_class ebp_reg(EBP);
// Singleton class for stack pointer
reg_class sp_reg(ESP);
// Singleton class for instruction pointer
// reg_class ip_reg(EIP);
// Singleton class for condition codes
reg_class int_flags(EFLAGS);
// Class of integer register pairs
reg_class long_reg( EAX,EDX, ECX,EBX, EBP,EDI );
// Class of integer register pairs that aligns with calling convention
reg_class eadx_reg( EAX,EDX );
reg_class ebcx_reg( ECX,EBX );
// Not AX or DX, used in divides
reg_class nadx_reg( EBX,ECX,ESI,EDI,EBP );

// Floating point registers.  Notice FPR0 is not a choice.
// FPR0 is not ever allocated; we use clever encodings to fake
// a 2-address instruction out of Intels FP stack.
reg_class flt_reg( FPR1L,FPR2L,FPR3L,FPR4L,FPR5L,FPR6L,FPR7L );

// make a register class for SSE registers
reg_class xmm_reg(XMM0a, XMM1a, XMM2a, XMM3a, XMM4a, XMM5a, XMM6a, XMM7a);

// make a double register class for SSE2 registers
reg_class xdb_reg(XMM0a,XMM0b, XMM1a,XMM1b, XMM2a,XMM2b, XMM3a,XMM3b,
                  XMM4a,XMM4b, XMM5a,XMM5b, XMM6a,XMM6b, XMM7a,XMM7b );

reg_class dbl_reg( FPR1L,FPR1H, FPR2L,FPR2H, FPR3L,FPR3H,
                   FPR4L,FPR4H, FPR5L,FPR5H, FPR6L,FPR6H,
                   FPR7L,FPR7H );

reg_class flt_reg0( FPR1L );
reg_class dbl_reg0( FPR1L,FPR1H );
reg_class dbl_reg1( FPR2L,FPR2H );
reg_class dbl_notreg0( FPR2L,FPR2H, FPR3L,FPR3H, FPR4L,FPR4H,
                       FPR5L,FPR5H, FPR6L,FPR6H, FPR7L,FPR7H );

// XMM6 and XMM7 could be used as temporary registers for long, float and
// double values for SSE2.
reg_class xdb_reg6( XMM6a,XMM6b );
reg_class xdb_reg7( XMM7a,XMM7b );
%}
 233 
 234 
 235 //----------SOURCE BLOCK-------------------------------------------------------
 236 // This is a block of C++ code which provides values, functions, and
 237 // definitions necessary in the rest of the architecture description
source_hpp %{
// Must be visible to the DFA in dfa_x86_32.cpp
// (presumably reports whether the high 32 bits of one MulL input are known
// zero, enabling a cheaper multiply form — confirm against its definition)
extern bool is_mulL_operand_hi32_zero(MulLNode* n, bool isLeft);
%}
 242 
 243 source %{
// Shorthand names for the relocation formats of 32-bit immediate and
// 32-bit displacement operands.
#define   RELOC_IMM32    Assembler::imm_operand
#define   RELOC_DISP32   Assembler::disp32_operand

// Conventional shorthand: assembler calls in this file go through a local
// MacroAssembler named '_masm'.
#define __ _masm.

// How to find the high register of a Long pair, given the low register
// (register pairs are laid out two allocator slots apart in this file).
#define   HIGH_FROM_LOW(x) ((x)+2)
 251 
 252 // These masks are used to provide 128-bit aligned bitmasks to the XMM
 253 // instructions, to allow sign-masking or sign-bit flipping.  They allow
 254 // fast versions of NegF/NegD and AbsF/AbsD.
 255 
 256 // Note: 'double' and 'long long' have 32-bits alignment on x86.
 257 static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
 258   // Use the expression (adr)&(~0xF) to provide 128-bits aligned address
 259   // of 128-bits operands for SSE instructions.
 260   jlong *operand = (jlong*)(((uintptr_t)adr)&((uintptr_t)(~0xF)));
 261   // Store the value to a 128-bits operand.
 262   operand[0] = lo;
 263   operand[1] = hi;
 264   return operand;
 265 }
 266 
// Buffer for 128-bits masks used by SSE instructions.
static jlong fp_signmask_pool[(4+1)*2]; // 4*128bits(data) + 128bits(alignment)

// Static initialization during VM startup.
// Each pointer below refers to a 16-byte-aligned slot inside
// fp_signmask_pool holding the mask that clears (signmask) or flips
// (signflip) the sign bit(s) of packed float / double values.
static jlong *float_signmask_pool  = double_quadword(&fp_signmask_pool[1*2], CONST64(0x7FFFFFFF7FFFFFFF), CONST64(0x7FFFFFFF7FFFFFFF));
static jlong *double_signmask_pool = double_quadword(&fp_signmask_pool[2*2], CONST64(0x7FFFFFFFFFFFFFFF), CONST64(0x7FFFFFFFFFFFFFFF));
static jlong *float_signflip_pool  = double_quadword(&fp_signmask_pool[3*2], CONST64(0x8000000080000000), CONST64(0x8000000080000000));
static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));
 275 
// Offset hacking within calls.
// Number of bytes emitted before a call to restore FPU precision when the
// method runs in 24-bit FP mode: the 6-byte fldcw, else nothing.
static int pre_call_FPU_size() {
  if (Compile::current()->in_24_bit_fp_mode())
    return 6; // fldcw
  return 0;
}
 282 
// Size in bytes of the mov that preserves SP before a method-handle invoke
// (see CallStaticJavaHandleNode::compute_padding below).
static int preserve_SP_size() {
  return LP64_ONLY(1 +) 2;  // [rex,] op, rm(reg/reg)
}
 286 
// !!!!! Special hack to get all type of calls to specify the byte offset
//       from the start of the call to the point where the return address
//       will point.
int MachCallStaticJavaNode::ret_addr_offset() {
  int offset = 5 + pre_call_FPU_size();  // 5 bytes from start of call to where return address points
  // Method-handle invokes additionally emit a mov to preserve SP.
  if (_method_handle_invoke)
    offset += preserve_SP_size();
  return offset;
}
 296 
// Dynamic calls emit a 5-byte MOV before the 5-byte CALL
// (see CallDynamicJavaDirectNode::compute_padding below).
int MachCallDynamicJavaNode::ret_addr_offset() {
  return 10 + pre_call_FPU_size();  // 10 bytes from start of call to where return address points
}
 300 
// Byte size of the FFree_Float_Stack_All sequence; -1 until it has been
// emitted once and the size recorded.
static int sizeof_FFree_Float_Stack_All = -1;

int MachCallRuntimeNode::ret_addr_offset() {
  assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
  // FFree sequence + 5-byte CALL (+ optional fldcw).
  return sizeof_FFree_Float_Stack_All + 5 + pre_call_FPU_size();
}
 307 
// Indicate if the safepoint node needs the polling page as an input.
// Since x86 does have absolute addressing, it doesn't: the poll address is
// encoded directly as a 32-bit absolute operand (see MachEpilogNode::emit).
bool SafePointNode::needs_polling_address_input() {
  return false;
}
 313 
 314 //
 315 // Compute padding required for nodes which need alignment
 316 //
 317 
 318 // The address of the call instruction needs to be 4-byte aligned to
 319 // ensure that it does not span a cache line so that it can be patched.
 320 int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
 321   current_offset += pre_call_FPU_size();  // skip fldcw, if any
 322   current_offset += 1;      // skip call opcode byte
 323   return round_to(current_offset, alignment_required()) - current_offset;
 324 }
 325 
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
// Same as CallStaticJavaDirectNode::compute_padding, plus the SP-preserving
// mov emitted for method-handle invokes.
int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
  current_offset += pre_call_FPU_size();  // skip fldcw, if any
  current_offset += preserve_SP_size();   // skip mov rbp, rsp
  current_offset += 1;      // skip call opcode byte
  return round_to(current_offset, alignment_required()) - current_offset;
}
 334 
// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
// Dynamic calls also emit a 5-byte MOV ahead of the call.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
  current_offset += pre_call_FPU_size();  // skip fldcw, if any
  current_offset += 5;      // skip MOV instruction
  current_offset += 1;      // skip call opcode byte
  return round_to(current_offset, alignment_required()) - current_offset;
}
 343 
#ifndef PRODUCT
// Debug-only pretty-print: a breakpoint node is the x86 INT3 instruction.
void MachBreakpointNode::format( PhaseRegAlloc *, outputStream* st ) const {
  st->print("INT3");
}
#endif
 349 
 350 // EMIT_RM()
 351 void emit_rm(CodeBuffer &cbuf, int f1, int f2, int f3) {
 352   unsigned char c = (unsigned char)((f1 << 6) | (f2 << 3) | f3);
 353   *(cbuf.code_end()) = c;
 354   cbuf.set_code_end(cbuf.code_end() + 1);
 355 }
 356 
 357 // EMIT_CC()
 358 void emit_cc(CodeBuffer &cbuf, int f1, int f2) {
 359   unsigned char c = (unsigned char)( f1 | f2 );
 360   *(cbuf.code_end()) = c;
 361   cbuf.set_code_end(cbuf.code_end() + 1);
 362 }
 363 
 364 // EMIT_OPCODE()
 365 void emit_opcode(CodeBuffer &cbuf, int code) {
 366   *(cbuf.code_end()) = (unsigned char)code;
 367   cbuf.set_code_end(cbuf.code_end() + 1);
 368 }
 369 
// EMIT_OPCODE() w/ relocation information
// Records a relocation at inst_mark()+offset before emitting the byte, so
// the relocation is anchored to the instruction, not the current position.
void emit_opcode(CodeBuffer &cbuf, int code, relocInfo::relocType reloc, int offset = 0) {
  cbuf.relocate(cbuf.inst_mark() + offset, reloc);
  emit_opcode(cbuf, code);
}
 375 
 376 // EMIT_D8()
 377 void emit_d8(CodeBuffer &cbuf, int d8) {
 378   *(cbuf.code_end()) = (unsigned char)d8;
 379   cbuf.set_code_end(cbuf.code_end() + 1);
 380 }
 381 
 382 // EMIT_D16()
 383 void emit_d16(CodeBuffer &cbuf, int d16) {
 384   *((short *)(cbuf.code_end())) = d16;
 385   cbuf.set_code_end(cbuf.code_end() + 2);
 386 }
 387 
 388 // EMIT_D32()
 389 void emit_d32(CodeBuffer &cbuf, int d32) {
 390   *((int *)(cbuf.code_end())) = d32;
 391   cbuf.set_code_end(cbuf.code_end() + 4);
 392 }
 393 
// emit 32 bit value and construct relocation entry from relocInfo::relocType
// The relocation is anchored at the instruction mark, not at the data itself.
void emit_d32_reloc(CodeBuffer &cbuf, int d32, relocInfo::relocType reloc,
        int format) {
  cbuf.relocate(cbuf.inst_mark(), reloc, format);

  *((int *)(cbuf.code_end())) = d32;
  cbuf.set_code_end(cbuf.code_end() + 4);
}
 402 
// emit 32 bit value and construct relocation entry from RelocationHolder
void emit_d32_reloc(CodeBuffer &cbuf, int d32, RelocationHolder const& rspec,
        int format) {
#ifdef ASSERT
  // When embedding an oop constant (other than NULL or the non-oop
  // sentinel), verify it really is an oop and is safe to embed in code.
  if (rspec.reloc()->type() == relocInfo::oop_type && d32 != 0 && d32 != (int)Universe::non_oop_word()) {
    assert(oop(d32)->is_oop() && (ScavengeRootsInCode || !oop(d32)->is_scavengable()), "cannot embed scavengable oops in code");
  }
#endif
  cbuf.relocate(cbuf.inst_mark(), rspec, format);

  *((int *)(cbuf.code_end())) = d32;
  cbuf.set_code_end(cbuf.code_end() + 4);
}
 416 
// Access stack slot for load or store
// Emits 'opcode' with an [ESP+disp] memory operand.  ESP as a base always
// requires a SIB byte; the displacement is 8-bit when it fits, else 32-bit.
void store_to_stackslot(CodeBuffer &cbuf, int opcode, int rm_field, int disp) {
  emit_opcode( cbuf, opcode );               // (e.g., FILD   [ESP+src])
  if( -128 <= disp && disp <= 127 ) {
    emit_rm( cbuf, 0x01, rm_field, ESP_enc );  // R/M byte, mod=01 (disp8)
    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);    // SIB byte (scale=0, no index)
    emit_d8 (cbuf, disp);     // 8-bit displacement
  } else {
    emit_rm( cbuf, 0x02, rm_field, ESP_enc );  // R/M byte, mod=10 (disp32)
    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);    // SIB byte (scale=0, no index)
    emit_d32(cbuf, disp);     // 32-bit displacement
  }
}
 430 
   // eRegI ereg, memory mem) %{    // emit_reg_mem
// Emit the ModRM (and SIB, when needed) bytes plus displacement for a
// register/memory operand [base + index*scale + displace].
//   reg_encoding    - encoding of the register in the ModRM 'reg' field
//   index == 0x4    - means "no index" (0x4 is not a legal index encoding)
//   base == -1      - means absolute 32-bit address (mod=00, rm=101)
//   displace_is_oop - displacement is an embedded oop; emit with relocation
//                     (this also forces the 32-bit displacement form)
void encode_RegMem( CodeBuffer &cbuf, int reg_encoding, int base, int index, int scale, int displace, bool displace_is_oop ) {
  // There is no index & no scale, use form without SIB byte
  if ((index == 0x4) &&
      (scale == 0) && (base != ESP_enc)) {
    // If no displacement, mode is 0x0; unless base is [EBP]
    // (mod=00, rm=EBP means absolute address, so EBP needs an explicit disp)
    if ( (displace == 0) && (base != EBP_enc) ) {
      emit_rm(cbuf, 0x0, reg_encoding, base);
    }
    else {                    // If 8-bit displacement, mode 0x1
      if ((displace >= -128) && (displace <= 127)
          && !(displace_is_oop) ) {
        emit_rm(cbuf, 0x1, reg_encoding, base);
        emit_d8(cbuf, displace);
      }
      else {                  // If 32-bit displacement
        if (base == -1) { // Special flag for absolute address
          emit_rm(cbuf, 0x0, reg_encoding, 0x5);
          // (manual lies; no SIB needed here)
          if ( displace_is_oop ) {
            emit_d32_reloc(cbuf, displace, relocInfo::oop_type, 1);
          } else {
            emit_d32      (cbuf, displace);
          }
        }
        else {                // Normal base + offset
          emit_rm(cbuf, 0x2, reg_encoding, base);
          if ( displace_is_oop ) {
            emit_d32_reloc(cbuf, displace, relocInfo::oop_type, 1);
          } else {
            emit_d32      (cbuf, displace);
          }
        }
      }
    }
  }
  else {                      // Else, encode with the SIB byte
    // If no displacement, mode is 0x0; unless base is [EBP]
    if (displace == 0 && (base != EBP_enc)) {  // If no displacement
      emit_rm(cbuf, 0x0, reg_encoding, 0x4);
      emit_rm(cbuf, scale, index, base);
    }
    else {                    // If 8-bit displacement, mode 0x1
      if ((displace >= -128) && (displace <= 127)
          && !(displace_is_oop) ) {
        emit_rm(cbuf, 0x1, reg_encoding, 0x4);
        emit_rm(cbuf, scale, index, base);
        emit_d8(cbuf, displace);
      }
      else {                  // If 32-bit displacement
        if (base == 0x04 ) {  // ESP base: encoded literally in the SIB byte
          emit_rm(cbuf, 0x2, reg_encoding, 0x4);
          emit_rm(cbuf, scale, index, 0x04);
        } else {
          emit_rm(cbuf, 0x2, reg_encoding, 0x4);
          emit_rm(cbuf, scale, index, base);
        }
        if ( displace_is_oop ) {
          emit_d32_reloc(cbuf, displace, relocInfo::oop_type, 1);
        } else {
          emit_d32      (cbuf, displace);
        }
      }
    }
  }
}
 497 
 498 
 499 void encode_Copy( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
 500   if( dst_encoding == src_encoding ) {
 501     // reg-reg copy, use an empty encoding
 502   } else {
 503     emit_opcode( cbuf, 0x8B );
 504     emit_rm(cbuf, 0x3, dst_encoding, src_encoding );
 505   }
 506 }
 507 
 508 void encode_CopyXD( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
 509   if( dst_encoding == src_encoding ) {
 510     // reg-reg copy, use an empty encoding
 511   } else {
 512     MacroAssembler _masm(&cbuf);
 513 
 514     __ movdqa(as_XMMRegister(dst_encoding), as_XMMRegister(src_encoding));
 515   }
 516 }
 517 
 518 
 519 //=============================================================================
#ifndef PRODUCT
// Debug-only pretty-print of the prolog; must mirror the instruction
// sequence produced by MachPrologNode::emit below.
void MachPrologNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  Compile* C = ra_->C;
  if( C->in_24_bit_fp_mode() ) {
    st->print("FLDCW  24 bit fpu control word");
    st->print_cr(""); st->print("\t");
  }

  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (C->need_stack_bang(framesize)) {
    st->print_cr("# stack bang"); st->print("\t");
  }
  st->print_cr("PUSHL  EBP"); st->print("\t");

  if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
    st->print("PUSH   0xBADB100D\t# Majik cookie for stack depth check");
    st->print_cr(""); st->print("\t");
    framesize -= wordSize;
  }

  // Same short/long SUB selection as emit(): the short form is only usable
  // when something else already made the entry sequence patchable.
  if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
    if (framesize) {
      st->print("SUB    ESP,%d\t# Create frame",framesize);
    }
  } else {
    st->print("SUB    ESP,%d\t# Create frame",framesize);
  }
}
#endif
 558 
 559 
// Emit the method prolog: optional fldcw, stack bang, push rbp, optional
// stack-depth cookie, and frame allocation.
void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile* C = ra_->C;

  if (UseSSE >= 2 && VerifyFPU) {
    MacroAssembler masm(&cbuf);
    masm.verify_FPU(0, "FPU stack must be clean on entry");
  }

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The fldcw is ok at 6 bytes, the push to verify stack
  // depth is ok at 5 bytes, the frame allocation can be either 3 or
  // 6 bytes. So if we don't do the fldcw or the push then we must
  // use the 6 byte frame allocation even if we have no frame. :-(
  // If method sets FPU control word do it now
  if( C->in_24_bit_fp_mode() ) {
    MacroAssembler masm(&cbuf);
    masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }

  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (C->need_stack_bang(framesize)) {
    MacroAssembler masm(&cbuf);
    masm.generate_stack_overflow_check(framesize);
  }

  // We always push rbp, so that on return to interpreter rbp, will be
  // restored correctly and we can correct the stack.
  emit_opcode(cbuf, 0x50 | EBP_enc);   // PUSH EBP (one-byte push r32 form)

  if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
    emit_opcode(cbuf, 0x68); // push 0xbadb100d
    emit_d32(cbuf, 0xbadb100d);
    framesize -= wordSize;
  }

  // Short (3-byte, imm8) SUB is only allowed when a preceding 5+ byte
  // instruction (fldcw or cookie push) already satisfies the patching
  // requirement noted above; otherwise force the 6-byte imm32 form.
  if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
    if (framesize) {
      emit_opcode(cbuf, 0x83);   // sub  SP,#framesize
      emit_rm(cbuf, 0x3, 0x05, ESP_enc);  // /5 = SUB, register-direct ESP
      emit_d8(cbuf, framesize);
    }
  } else {
    emit_opcode(cbuf, 0x81);   // sub  SP,#framesize
    emit_rm(cbuf, 0x3, 0x05, ESP_enc);    // /5 = SUB, register-direct ESP
    emit_d32(cbuf, framesize);
  }
  C->set_frame_complete(cbuf.code_end() - cbuf.code_begin());

#ifdef ASSERT
  // Debug check: verify SP has the expected alignment after the prolog.
  if (VerifyStackAtCalls) {
    Label L;
    MacroAssembler masm(&cbuf);
    masm.push(rax);
    masm.mov(rax, rsp);
    masm.andptr(rax, StackAlignmentInBytes-1);
    masm.cmpptr(rax, StackAlignmentInBytes-wordSize);
    masm.pop(rax);
    masm.jcc(Assembler::equal, L);
    masm.stop("Stack is not properly aligned!");
    masm.bind(L);
  }
#endif

}
 634 
// Prolog size depends on too many flags to tally by hand; emit into a
// scratch buffer and measure instead.
uint MachPrologNode::size(PhaseRegAlloc *ra_) const {
  return MachNode::size(ra_); // too many variables; just compute it the hard way
}
 638 
// Worst-case relocation entry count for the prolog.
int MachPrologNode::reloc() const {
  return 0; // a large enough number
}
 642 
 643 //=============================================================================
#ifndef PRODUCT
// Debug-only pretty-print of the epilog; must mirror MachEpilogNode::emit.
void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  Compile *C = ra_->C;
  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  if( C->in_24_bit_fp_mode() ) {
    st->print("FLDCW  standard control word");
    st->cr(); st->print("\t");
  }
  if( framesize ) {
    st->print("ADD    ESP,%d\t# Destroy frame",framesize);
    st->cr(); st->print("\t");
  }
  st->print_cr("POPL   EBP"); st->print("\t");
  if( do_polling() && C->is_method_compilation() ) {
    st->print("TEST   PollPage,EAX\t! Poll Safepoint");
    st->cr(); st->print("\t");
  }
}
#endif
 667 
// Emit the method epilog: optional fldcw restore, frame teardown, pop rbp,
// and the safepoint poll (a TEST against the absolute polling-page address).
void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile *C = ra_->C;

  // If method set FPU control word, restore to standard control word
  if( C->in_24_bit_fp_mode() ) {
    MacroAssembler masm(&cbuf);
    masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
  }

  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here

  if( framesize >= 128 ) {
    emit_opcode(cbuf, 0x81); // add  SP, #framesize  (imm32 form, 6 bytes)
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);  // /0 = ADD, register-direct ESP
    emit_d32(cbuf, framesize);
  }
  else if( framesize ) {
    emit_opcode(cbuf, 0x83); // add  SP, #framesize  (imm8 form, 3 bytes)
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);  // /0 = ADD, register-direct ESP
    emit_d8(cbuf, framesize);
  }

  emit_opcode(cbuf, 0x58 | EBP_enc);  // POP EBP (one-byte pop r32 form)

  if( do_polling() && C->is_method_compilation() ) {
    // TEST EAX, [polling_page]: mod=00, rm=101 encodes an absolute disp32.
    cbuf.relocate(cbuf.code_end(), relocInfo::poll_return_type, 0);
    emit_opcode(cbuf,0x85);
    emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
    emit_d32(cbuf, (intptr_t)os::get_polling_page());
  }
}
 704 
// Byte size of the epilog; must agree exactly with MachEpilogNode::emit:
// fldcw (6) + ADD ESP (6 or 3) + POP EBP (1) + poll TEST (6).
uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
  Compile *C = ra_->C;
  // If method set FPU control word, restore to standard control word
  int size = C->in_24_bit_fp_mode() ? 6 : 0;
  if( do_polling() && C->is_method_compilation() ) size += 6;

  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  size++; // popl rbp,

  if( framesize >= 128 ) {
    size += 6;  // imm32 ADD form
  } else {
    size += framesize ? 3 : 0;  // imm8 ADD form, or nothing for empty frame
  }
  return size;
}
 725 
// Worst-case number of relocation entries the epilog may need.
int MachEpilogNode::reloc() const {
  return 0; // a large enough number
}

// The epilog uses the generic pipeline description.
const Pipeline * MachEpilogNode::pipeline() const {
  return MachNode::pipeline_class();
}

// Offset of the safepoint poll within the epilog; 0 on this platform.
int MachEpilogNode::safepoint_offset() const { return 0; }
 735 
 736 //=============================================================================
 737 
 738 enum RC { rc_bad, rc_int, rc_float, rc_xmm, rc_stack };
 739 static enum RC rc_class( OptoReg::Name reg ) {
 740 
 741   if( !OptoReg::is_valid(reg)  ) return rc_bad;
 742   if (OptoReg::is_stack(reg)) return rc_stack;
 743 
 744   VMReg r = OptoReg::as_VMReg(reg);
 745   if (r->is_Register()) return rc_int;
 746   if (r->is_FloatRegister()) {
 747     assert(UseSSE < 2, "shouldn't be used in SSE2+ mode");
 748     return rc_float;
 749   }
 750   assert(r->is_XMMRegister(), "must be");
 751   return rc_xmm;
 752 }
 753 
// Shared helper for [ESP+offset] loads/stores of integer registers and for
// the PUSH/POP and x87 store forms.  Runs in one of three modes:
//   cbuf != NULL            -- emit the instruction
//   cbuf == NULL, !do_size  -- print assembly to st (non-product only)
//   cbuf == NULL,  do_size  -- size calculation only
// Always returns the accumulated byte size (opcode + modrm + sib + disp).
static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg,
                        int opcode, const char *op_str, int size, outputStream* st ) {
  if( cbuf ) {
    emit_opcode  (*cbuf, opcode );
    encode_RegMem(*cbuf, Matcher::_regEncode[reg], ESP_enc, 0x4, 0, offset, false);
#ifndef PRODUCT
  } else if( !do_size ) {
    if( size != 0 ) st->print("\n\t");
    if( opcode == 0x8B || opcode == 0x89 ) { // MOV
      if( is_load ) st->print("%s   %s,[ESP + #%d]",op_str,Matcher::regName[reg],offset);
      else          st->print("%s   [ESP + #%d],%s",op_str,offset,Matcher::regName[reg]);
    } else { // FLD, FST, PUSH, POP
      st->print("%s [ESP + #%d]",op_str,offset);
    }
#endif
  }
  // Displacement is omitted at 0, one byte through 127, else four bytes.
  int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
  return size+3+offset_size;
}
 773 
// Helper for XMM registers.  Extra opcode bits, limited syntax.
// Emits/prints/sizes an XMM load or store at [ESP+offset]:
//   double (reg_lo+1 == reg_hi): MOVSD, or MOVLPD for loads when
//   UseXmmLoadAndClearUpper is off (MOVLPD leaves the upper half intact);
//   single: MOVSS.  Same three-mode convention as impl_helper.
static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
                         int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
  if( cbuf ) {
    if( reg_lo+1 == reg_hi ) { // double move?
      if( is_load && !UseXmmLoadAndClearUpper )
        emit_opcode(*cbuf, 0x66 ); // use 'movlpd' for load
      else
        emit_opcode(*cbuf, 0xF2 ); // use 'movsd' otherwise
    } else {
      emit_opcode(*cbuf, 0xF3 );
    }
    emit_opcode(*cbuf, 0x0F );
    if( reg_lo+1 == reg_hi && is_load && !UseXmmLoadAndClearUpper )
      emit_opcode(*cbuf, 0x12 );   // use 'movlpd' for load
    else
      emit_opcode(*cbuf, is_load ? 0x10 : 0x11 );
    encode_RegMem(*cbuf, Matcher::_regEncode[reg_lo], ESP_enc, 0x4, 0, offset, false);
#ifndef PRODUCT
  } else if( !do_size ) {
    if( size != 0 ) st->print("\n\t");
    if( reg_lo+1 == reg_hi ) { // double move?
      if( is_load ) st->print("%s %s,[ESP + #%d]",
                               UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
                               Matcher::regName[reg_lo], offset);
      else          st->print("MOVSD  [ESP + #%d],%s",
                               offset, Matcher::regName[reg_lo]);
    } else {
      if( is_load ) st->print("MOVSS  %s,[ESP + #%d]",
                               Matcher::regName[reg_lo], offset);
      else          st->print("MOVSS  [ESP + #%d],%s",
                               offset, Matcher::regName[reg_lo]);
    }
#endif
  }
  // prefix + 0x0F + opcode + modrm + sib = 5 bytes, plus the displacement.
  int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
  return size+5+offset_size;
}
 812 
 813 
// XMM register-to-register copy.  Uses MOVAPS/MOVAPD when
// UseXmmRegToRegMoveAll is set, otherwise MOVSS/MOVSD.  Double moves are
// recognized by adjacent lo/hi register halves.  Same three-mode
// convention as impl_helper; returns the accumulated byte size.
static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                            int src_hi, int dst_hi, int size, outputStream* st ) {
  if( UseXmmRegToRegMoveAll ) {//Use movaps,movapd to move between xmm registers
    if( cbuf ) {
      if( (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ) {
        emit_opcode(*cbuf, 0x66 );
      }
      emit_opcode(*cbuf, 0x0F );
      emit_opcode(*cbuf, 0x28 );
      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
        st->print("MOVAPD %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      } else {
        st->print("MOVAPS %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      }
#endif
    }
    // MOVAPD needs the 0x66 prefix (4 bytes); MOVAPS does not (3 bytes).
    return size + ((src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 4 : 3);
  } else {
    if( cbuf ) {
      emit_opcode(*cbuf, (src_lo+1 == src_hi && dst_lo+1 == dst_hi) ? 0xF2 : 0xF3 );
      emit_opcode(*cbuf, 0x0F );
      emit_opcode(*cbuf, 0x10 );
      emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst_lo], Matcher::_regEncode[src_lo] );
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
        st->print("MOVSD  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      } else {
        st->print("MOVSS  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      }
#endif
    }
    // prefix + 0x0F + 0x10 + modrm = 4 bytes either way.
    return size+4;
  }
}
 854 
// Integer register-to-register copy (MOV dst,src — opcode 0x8B).
// Same three-mode convention as impl_helper; MOV reg,reg is 2 bytes.
static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) {
  if( cbuf ) {
    emit_opcode(*cbuf, 0x8B );
    emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst], Matcher::_regEncode[src] );
#ifndef PRODUCT
  } else if( !do_size ) {
    if( size != 0 ) st->print("\n\t");
    st->print("MOV    %s,%s",Matcher::regName[dst],Matcher::regName[src]);
#endif
  }
  return size+2;
}
 867 
// Store an x87 register to [ESP+offset].  If the source is not already at
// the top of the FP stack it is FLD'ed first (2 bytes) and the store uses
// the popping form (FSTP) to rebalance the stack; otherwise a plain FST is
// used.  Double vs. single store is chosen by adjacent lo/hi halves.
// Delegates the memory operand to impl_helper (same three-mode convention).
static int impl_fp_store_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int src_hi, int dst_lo, int dst_hi,
                                 int offset, int size, outputStream* st ) {
  if( src_lo != FPR1L_num ) {      // Move value to top of FP stack, if not already there
    if( cbuf ) {
      emit_opcode( *cbuf, 0xD9 );  // FLD (i.e., push it)
      emit_d8( *cbuf, 0xC0-1+Matcher::_regEncode[src_lo] );
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("FLD    %s",Matcher::regName[src_lo]);
#endif
    }
    size += 2;
  }

  // EBX_num/EDX_num are passed only as the /r field selector for the
  // store opcode: /3 = store-and-pop, /2 = store-without-pop.
  int st_op = (src_lo != FPR1L_num) ? EBX_num /*store & pop*/ : EDX_num /*store no pop*/;
  const char *op_str;
  int op;
  if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double store?
    op_str = (src_lo != FPR1L_num) ? "FSTP_D" : "FST_D ";
    op = 0xDD;
  } else {                   // 32-bit store
    op_str = (src_lo != FPR1L_num) ? "FSTP_S" : "FST_S ";
    op = 0xD9;
    assert( !OptoReg::is_valid(src_hi) && !OptoReg::is_valid(dst_hi), "no non-adjacent float-stores" );
  }

  return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size, st);
}
 897 
// Workhorse behind MachSpillCopyNode's format/emit/size methods.  Picks a
// copy strategy from the rc_class of source and destination (int, x87,
// xmm, stack) and handles the second (high) word of long/double values.
// Same three-mode convention as the impl_* helpers:
//   cbuf != NULL            -- emit machine code
//   cbuf == NULL, !do_size  -- print assembly to st
//   cbuf == NULL,  do_size  -- size calculation only
// Returns the total byte size of the emitted copy sequence.
uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bool do_size, outputStream* st ) const {
  // Get registers to move
  OptoReg::Name src_second = ra_->get_reg_second(in(1));
  OptoReg::Name src_first = ra_->get_reg_first(in(1));
  OptoReg::Name dst_second = ra_->get_reg_second(this );
  OptoReg::Name dst_first = ra_->get_reg_first(this );

  enum RC src_second_rc = rc_class(src_second);
  enum RC src_first_rc = rc_class(src_first);
  enum RC dst_second_rc = rc_class(dst_second);
  enum RC dst_first_rc = rc_class(dst_first);

  assert( OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register" );

  // Generate spill code!
  int size = 0;

  if( src_first == dst_first && src_second == dst_second )
    return size;            // Self copy, no move

  // --------------------------------------
  // Check for mem-mem move.  push/pop to move.
  if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
    if( src_second == dst_first ) { // overlapping stack copy ranges
      assert( src_second_rc == rc_stack && dst_second_rc == rc_stack, "we only expect a stk-stk copy here" );
      // Copy the high word first so the low-word copy below cannot
      // clobber it (the slot ranges overlap by one word).
      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size, st);
      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size, st);
      src_second_rc = dst_second_rc = rc_bad;  // flag as already moved the second bits
    }
    // move low bits
    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),ESI_num,0xFF,"PUSH  ",size, st);
    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),EAX_num,0x8F,"POP   ",size, st);
    if( src_second_rc == rc_stack && dst_second_rc == rc_stack ) { // mov second bits
      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size, st);
      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size, st);
    }
    return size;
  }

  // --------------------------------------
  // Check for integer reg-reg copy
  if( src_first_rc == rc_int && dst_first_rc == rc_int )
    size = impl_mov_helper(cbuf,do_size,src_first,dst_first,size, st);

  // Check for integer store
  if( src_first_rc == rc_int && dst_first_rc == rc_stack )
    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first,0x89,"MOV ",size, st);

  // Check for integer load
  if( dst_first_rc == rc_int && src_first_rc == rc_stack )
    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first,0x8B,"MOV ",size, st);

  // --------------------------------------
  // Check for float reg-reg copy
  if( src_first_rc == rc_float && dst_first_rc == rc_float ) {
    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||
            (src_first+1 == src_second && dst_first+1 == dst_second), "no non-adjacent float-moves" );
    if( cbuf ) {

      // Note the mucking with the register encode to compensate for the 0/1
      // indexing issue mentioned in a comment in the reg_def sections
      // for FPR registers many lines above here.

      if( src_first != FPR1L_num ) {
        emit_opcode  (*cbuf, 0xD9 );           // FLD    ST(i)
        emit_d8      (*cbuf, 0xC0+Matcher::_regEncode[src_first]-1 );
        emit_opcode  (*cbuf, 0xDD );           // FSTP   ST(i)
        emit_d8      (*cbuf, 0xD8+Matcher::_regEncode[dst_first] );
     } else {
        emit_opcode  (*cbuf, 0xDD );           // FST    ST(i)
        emit_d8      (*cbuf, 0xD0+Matcher::_regEncode[dst_first]-1 );
     }
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      if( src_first != FPR1L_num ) st->print("FLD    %s\n\tFSTP   %s",Matcher::regName[src_first],Matcher::regName[dst_first]);
      else                      st->print(             "FST    %s",                            Matcher::regName[dst_first]);
#endif
    }
    // FLD+FSTP is 2+2 bytes; a plain FST from the stack top is 2 bytes.
    return size + ((src_first != FPR1L_num) ? 2+2 : 2);
  }

  // Check for float store
  if( src_first_rc == rc_float && dst_first_rc == rc_stack ) {
    return impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,ra_->reg2offset(dst_first),size, st);
  }

  // Check for float load
  if( dst_first_rc == rc_float && src_first_rc == rc_stack ) {
    int offset = ra_->reg2offset(src_first);
    const char *op_str;
    int op;
    if( src_first+1 == src_second && dst_first+1 == dst_second ) { // double load?
      op_str = "FLD_D";
      op = 0xDD;
    } else {                   // 32-bit load
      op_str = "FLD_S";
      op = 0xD9;
      assert( src_second_rc == rc_bad && dst_second_rc == rc_bad, "no non-adjacent float-loads" );
    }
    if( cbuf ) {
      emit_opcode  (*cbuf, op );
      encode_RegMem(*cbuf, 0x0, ESP_enc, 0x4, 0, offset, false);
      emit_opcode  (*cbuf, 0xDD );           // FSTP   ST(i)
      emit_d8      (*cbuf, 0xD8+Matcher::_regEncode[dst_first] );
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("%s  ST,[ESP + #%d]\n\tFSTP   %s",op_str, offset,Matcher::regName[dst_first]);
#endif
    }
    // FLD = opcode + modrm + sib + disp; FSTP = 2 bytes.
    int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
    return size + 3+offset_size+2;
  }

  // Check for xmm reg-reg copy
  if( src_first_rc == rc_xmm && dst_first_rc == rc_xmm ) {
    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||
            (src_first+1 == src_second && dst_first+1 == dst_second),
            "no non-adjacent float-moves" );
    return impl_movx_helper(cbuf,do_size,src_first,dst_first,src_second, dst_second, size, st);
  }

  // Check for xmm store
  if( src_first_rc == rc_xmm && dst_first_rc == rc_stack ) {
    return impl_x_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first, src_second, size, st);
  }

  // Check for float xmm load
  if( dst_first_rc == rc_xmm && src_first_rc == rc_stack ) {
    return impl_x_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first, dst_second, size, st);
  }

  // Copy from float reg to xmm reg
  if( dst_first_rc == rc_xmm && src_first_rc == rc_float ) {
    // copy to the top of stack from floating point reg
    // and use LEA to preserve flags
    if( cbuf ) {
      emit_opcode(*cbuf,0x8D);  // LEA  ESP,[ESP-8]
      emit_rm(*cbuf, 0x1, ESP_enc, 0x04);
      emit_rm(*cbuf, 0x0, 0x04, ESP_enc);
      emit_d8(*cbuf,0xF8);
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("LEA    ESP,[ESP-8]");
#endif
    }
    size += 4;

    // Route the value through an 8-byte scratch slot just below ESP:
    // x87 store to [ESP+0], then xmm load from the same slot.
    size = impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,0,size, st);

    // Copy from the temp memory to the xmm reg.
    size = impl_x_helper(cbuf,do_size,true ,0,dst_first, dst_second, size, st);

    if( cbuf ) {
      emit_opcode(*cbuf,0x8D);  // LEA  ESP,[ESP+8]
      emit_rm(*cbuf, 0x1, ESP_enc, 0x04);
      emit_rm(*cbuf, 0x0, 0x04, ESP_enc);
      emit_d8(*cbuf,0x08);
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("LEA    ESP,[ESP+8]");
#endif
    }
    size += 4;
    return size;
  }

  assert( size > 0, "missed a case" );

  // --------------------------------------------------------------------
  // Check for second bits still needing moving.
  if( src_second == dst_second )
    return size;               // Self copy; no move
  assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" );

  // Check for second word int-int move
  if( src_second_rc == rc_int && dst_second_rc == rc_int )
    return impl_mov_helper(cbuf,do_size,src_second,dst_second,size, st);

  // Check for second word integer store
  if( src_second_rc == rc_int && dst_second_rc == rc_stack )
    return impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),src_second,0x89,"MOV ",size, st);

  // Check for second word integer load
  if( dst_second_rc == rc_int && src_second_rc == rc_stack )
    return impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),dst_second,0x8B,"MOV ",size, st);


  // No strategy matched the remaining class combination.
  Unimplemented();
}
1091 
#ifndef PRODUCT
// Print the spill copy's assembly; delegates to implementation in
// format-only mode (no code buffer, do_size false).
void MachSpillCopyNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  implementation( NULL, ra_, false, st );
}
#endif

// Emit the spill copy into the code buffer.
void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  implementation( &cbuf, ra_, false, NULL );
}

// Byte size of the spill copy; delegates in size-only mode.
uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
  return implementation( NULL, ra_, true, NULL );
}
1105 
1106 //=============================================================================
#ifndef PRODUCT
// Print the nop padding; _count is the number of pad bytes.
void MachNopNode::format( PhaseRegAlloc *, outputStream* st ) const {
  st->print("NOP \t# %d bytes pad for loops and calls", _count);
}
#endif

// Emit _count bytes of nop padding via the macro assembler.
void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc * ) const {
  MacroAssembler _masm(&cbuf);
  __ nop(_count);
}

// One byte of code per requested pad byte.
uint MachNopNode::size(PhaseRegAlloc *) const {
  return _count;
}
1121 
1122 
1123 //=============================================================================
#ifndef PRODUCT
// Print the LEA that materializes the address of this node's stack slot.
void BoxLockNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  int reg = ra_->get_reg_first(this);
  st->print("LEA    %s,[ESP + #%d]",Matcher::regName[reg],offset);
}
#endif
1131 
// Emit LEA reg,[ESP+offset] to compute the address of the lock's stack
// slot.  Uses the 32-bit displacement form for offsets >= 128 (imm8 is
// sign-extended), otherwise the 8-bit form.  Must agree byte-for-byte
// with BoxLockNode::size below.
void BoxLockNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  int reg = ra_->get_encode(this);
  if( offset >= 128 ) {
    emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
    emit_rm(cbuf, 0x2, reg, 0x04);
    emit_rm(cbuf, 0x0, 0x04, ESP_enc);
    emit_d32(cbuf, offset);
  }
  else {
    emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
    emit_rm(cbuf, 0x1, reg, 0x04);
    emit_rm(cbuf, 0x0, 0x04, ESP_enc);
    emit_d8(cbuf, offset);
  }
}
1148 
1149 uint BoxLockNode::size(PhaseRegAlloc *ra_) const {
1150   int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
1151   if( offset >= 128 ) {
1152     return 7;
1153   }
1154   else {
1155     return 4;
1156   }
1157 }
1158 
1159 //=============================================================================
1160 
// emit call stub, compiled java to interpreter
// Emits the static-call stub: a "mov rbx,#0" (methodOop placeholder) and
// an unresolved jump, both fixed up later when the call site is bound.
void emit_java_to_interp(CodeBuffer &cbuf ) {
  // Stub is fixed up when the corresponding call is converted from calling
  // compiled code to calling interpreted code.
  // mov rbx,0
  // jmp -1

  address mark = cbuf.inst_mark();  // get mark within main instrs section

  // Note that the code buffer's inst_mark is always relative to insts.
  // That's why we must use the macroassembler to generate a stub.
  MacroAssembler _masm(&cbuf);

  address base =
  __ start_a_stub(Compile::MAX_stubs_size);
  if (base == NULL)  return;  // CodeBuffer::expand failed
  // static stub relocation stores the instruction address of the call
  __ relocate(static_stub_Relocation::spec(mark), RELOC_IMM32);
  // static stub relocation also tags the methodOop in the code-stream.
  __ movoop(rbx, (jobject)NULL);  // method is zapped till fixup time
  // This is recognized as unresolved by relocs/nativeInst/ic code
  __ jump(RuntimeAddress(__ pc()));

  __ end_a_stub();
  // Update current stubs pointer and restore code_end.
}
// size of call stub, compiled java to interpreter
// (5-byte mov + 5-byte jmp, matching emit_java_to_interp above)
uint size_java_to_interp() {
  return 10;  // movl; jmp
}
// relocation entries for call stub, compiled java to interpreter
uint reloc_java_to_interp() {
  return 4;  // 3 in emit_java_to_interp + 1 in Java_Static_Call
}
1195 
1196 //=============================================================================
#ifndef PRODUCT
// Print the unverified entry point: inline-cache klass check plus the
// alignment NOPs emitted by MachUEPNode::emit.
void MachUEPNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  st->print_cr(  "CMP    EAX,[ECX+4]\t# Inline cache check");
  st->print_cr("\tJNE    SharedRuntime::handle_ic_miss_stub");
  st->print_cr("\tNOP");
  st->print_cr("\tNOP");
  if( !OptoBreakpoint )
    st->print_cr("\tNOP");
}
#endif
1207 
// Emit the unverified entry point: compare the cached klass (EAX) with the
// receiver's klass (from ECX) and branch to the IC-miss stub on mismatch,
// then pad with NOPs so the verified entry point that follows is
// patchable.
void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  MacroAssembler masm(&cbuf);
#ifdef ASSERT
  uint code_size = cbuf.code_size();
#endif
  masm.cmpptr(rax, Address(rcx, oopDesc::klass_offset_in_bytes()));
  masm.jump_cc(Assembler::notEqual,
               RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  /* WARNING these NOPs are critical so that verified entry point is properly
     aligned for patching by NativeJump::patch_verified_entry() */
  int nops_cnt = 2;
  if( !OptoBreakpoint ) // Leave space for int3
     nops_cnt += 1;
  masm.nop(nops_cnt);

  assert(cbuf.code_size() - code_size == size(ra_), "checking code size of inline cache node");
}
1225 
1226 uint MachUEPNode::size(PhaseRegAlloc *ra_) const {
1227   return OptoBreakpoint ? 11 : 12;
1228 }
1229 
1230 
1231 //=============================================================================
uint size_exception_handler() {
  // NativeCall instruction size is the same as NativeJump.
  // The exception handler starts out as a jump and can be patched to
  // a call by deoptimization.  (4932387)
  // Note that this value is also credited (in output.cpp) to
  // the size of the code section.
  return NativeJump::instruction_size;
}
1240 
// Emit exception handler code.  Stuff framesize into a register
// and call a VM stub routine.
// Returns the offset of the handler within the stubs section, or 0 when
// the code buffer could not be expanded.
int emit_exception_handler(CodeBuffer& cbuf) {

  // Note that the code buffer's inst_mark is always relative to insts.
  // That's why we must use the macroassembler to generate a handler.
  MacroAssembler _masm(&cbuf);
  address base =
  __ start_a_stub(size_exception_handler());
  if (base == NULL)  return 0;  // CodeBuffer::expand failed
  int offset = __ offset();
  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->instructions_begin()));
  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
  __ end_a_stub();
  return offset;
}
1257 
uint size_deopt_handler() {
  // The deopt handler is a 5-byte push of the current pc followed by a
  // jump to the deopt blob (NativeJump::instruction_size, same size as a
  // NativeCall it may be patched into -- 4932387).
  // Note that this value is also credited (in output.cpp) to
  // the size of the code section.
  return 5 + NativeJump::instruction_size; // pushl(); jmp;
}
1266 
1267 // Emit deopt handler code.
1268 int emit_deopt_handler(CodeBuffer& cbuf) {
1269 
1270   // Note that the code buffer's inst_mark is always relative to insts.
1271   // That's why we must use the macroassembler to generate a handler.
1272   MacroAssembler _masm(&cbuf);
1273   address base =
1274   __ start_a_stub(size_exception_handler());
1275   if (base == NULL)  return 0;  // CodeBuffer::expand failed
1276   int offset = __ offset();
1277   InternalAddress here(__ pc());
1278   __ pushptr(here.addr());
1279 
1280   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1281   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
1282   __ end_a_stub();
1283   return offset;
1284 }
1285 
1286 
// Place the double constant x in the constant area and emit a 4-byte
// internal-word relocated reference to it at the current code position.
// The insts mark is saved/restored because constructing the
// MacroAssembler shifts it.
static void emit_double_constant(CodeBuffer& cbuf, double x) {
  int mark = cbuf.insts()->mark_off();
  MacroAssembler _masm(&cbuf);
  address double_address = __ double_constant(x);
  cbuf.insts()->set_mark_off(mark);  // preserve mark across masm shift
  emit_d32_reloc(cbuf,
                 (int)double_address,
                 internal_word_Relocation::spec(double_address),
                 RELOC_DISP32);
}
1297 
// Float analogue of emit_double_constant: place the float constant x in
// the constant area and emit a relocated 4-byte reference to it.
static void emit_float_constant(CodeBuffer& cbuf, float x) {
  int mark = cbuf.insts()->mark_off();
  MacroAssembler _masm(&cbuf);
  address float_address = __ float_constant(x);
  cbuf.insts()->set_mark_off(mark);  // preserve mark across masm shift
  emit_d32_reloc(cbuf,
                 (int)float_address,
                 internal_word_Relocation::spec(float_address),
                 RELOC_DISP32);
}
1308 
1309 
1310 const bool Matcher::match_rule_supported(int opcode) {
1311   if (!has_match_rule(opcode))
1312     return false;
1313 
1314   return true;  // Per default match rules are supported.
1315 }
1316 
// Map an allocator register number to an FPU stack offset.
int Matcher::regnum_to_fpu_offset(int regnum) {
  return regnum - 32; // The FP registers are in the second chunk
}
1320 
// Bit-exact test for +0.0f (a floating compare would also accept -0.0f).
bool is_positive_zero_float(jfloat f) {
  return jint_cast(f) == jint_cast(0.0F);
}

// Bit-exact test for 1.0f.
bool is_positive_one_float(jfloat f) {
  return jint_cast(f) == jint_cast(1.0F);
}

// Bit-exact test for +0.0 (a floating compare would also accept -0.0).
bool is_positive_zero_double(jdouble d) {
  return jlong_cast(d) == jlong_cast(0.0);
}

// Bit-exact test for 1.0.
bool is_positive_one_double(jdouble d) {
  return jlong_cast(d) == jlong_cast(1.0);
}
1336 
// This is UltraSparc specific, true just means we have fast l2f conversion
// (the hook exists for ports where long-to-float is slow; unconditionally
// true here).
const bool Matcher::convL2FSupported(void) {
  return true;
}
1341 
1342 // Vector width in bytes
1343 const uint Matcher::vector_width_in_bytes(void) {
1344   return UseSSE >= 2 ? 8 : 0;
1345 }
1346 
1347 // Vector ideal reg
1348 const uint Matcher::vector_ideal_reg(void) {
1349   return Op_RegD;
1350 }
1351 
1352 // Is this branch offset short enough that a short branch can be used?
1353 //
1354 // NOTE: If the platform does not provide any short branch variants, then
1355 //       this method should return false for offset 0.
1356 bool Matcher::is_short_branch_offset(int rule, int offset) {
1357   // the short version of jmpConUCF2 contains multiple branches,
1358   // making the reach slightly less
1359   if (rule == jmpConUCF2_rule)
1360     return (-126 <= offset && offset <= 125);
1361   return (-128 <= offset && offset <= 127);
1362 }
1363 
// On 32-bit x86 a long constant is always split, so never "simple".
const bool Matcher::isSimpleConstant64(jlong value) {
  // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.
  return false;
}
1368 
// The ecx parameter to rep stos for the ClearArray node is in dwords.
const bool Matcher::init_array_count_is_in_bytes = false;

// Threshold size for cleararray: arrays at or below this size are cleared
// with explicit stores instead of rep stos.
const int Matcher::init_array_short_size = 8 * BytesPerLong;

// Should the Matcher clone shifts on addressing modes, expecting them to
// be subsumed into complex addressing expressions or compute them into
// registers?  True for Intel but false for most RISCs
const bool Matcher::clone_shift_expressions = true;

// Is it better to copy float constants, or load them directly from memory?
// Intel can load a float constant from a direct address, requiring no
// extra registers.  Most RISCs will have to materialize an address into a
// register first, so they would do better to copy the constant from stack.
const bool Matcher::rematerialize_float_constants = true;

// If CPU can load and store mis-aligned doubles directly then no fixup is
// needed.  Else we split the double into 2 integer pieces and move it
// piece-by-piece.  Only happens when passing doubles into C code as the
// Java calling convention forces doubles to be aligned.
const bool Matcher::misaligned_doubles_ok = true;
1391 
1392 
// Rewrite the memory operand of a node used for an implicit null check
// into its *_win95_safe variant.  Walks the node's operand list to find
// the operand that owns input edge idx, then replaces it according to its
// addressing-mode opcode; DIRECT/INDOFFSET32X and the LOAD_LONG forms
// need no transformation.
void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
  // Get the memory operand from the node
  uint numopnds = node->num_opnds();        // Virtual call for number of operands
  uint skipped  = node->oper_input_base();  // Sum of leaves skipped so far
  assert( idx >= skipped, "idx too low in pd_implicit_null_fixup" );
  uint opcnt     = 1;                 // First operand
  uint num_edges = node->_opnds[1]->num_edges(); // leaves for first operand
  // Advance until the operand whose edge range covers idx is found.
  while( idx >= skipped+num_edges ) {
    skipped += num_edges;
    opcnt++;                          // Bump operand count
    assert( opcnt < numopnds, "Accessing non-existent operand" );
    num_edges = node->_opnds[opcnt]->num_edges(); // leaves for next operand
  }

  MachOper *memory = node->_opnds[opcnt];
  MachOper *new_memory = NULL;
  switch (memory->opcode()) {
  case DIRECT:
  case INDOFFSET32X:
    // No transformation necessary.
    return;
  case INDIRECT:
    new_memory = new (C) indirect_win95_safeOper( );
    break;
  case INDOFFSET8:
    new_memory = new (C) indOffset8_win95_safeOper(memory->disp(NULL, NULL, 0));
    break;
  case INDOFFSET32:
    new_memory = new (C) indOffset32_win95_safeOper(memory->disp(NULL, NULL, 0));
    break;
  case INDINDEXOFFSET:
    new_memory = new (C) indIndexOffset_win95_safeOper(memory->disp(NULL, NULL, 0));
    break;
  case INDINDEXSCALE:
    new_memory = new (C) indIndexScale_win95_safeOper(memory->scale());
    break;
  case INDINDEXSCALEOFFSET:
    new_memory = new (C) indIndexScaleOffset_win95_safeOper(memory->scale(), memory->disp(NULL, NULL, 0));
    break;
  case LOAD_LONG_INDIRECT:
  case LOAD_LONG_INDOFFSET32:
    // Does not use EBP as address register, use { EDX, EBX, EDI, ESI}
    return;
  default:
    assert(false, "unexpected memory operand in pd_implicit_null_fixup()");
    return;
  }
  node->_opnds[opcnt] = new_memory;
}
1442 
// Advertise here if the CPU requires explicit rounding operations
// to implement the UseStrictFP mode.  (x87 computes at extended
// precision, so strictfp needs explicit rounding.)
const bool Matcher::strict_fp_requires_explicit_rounding = true;

// Do floats take an entire double register or just half?
const bool Matcher::float_in_double = true;
// Do ints take an entire long register or just half?
const bool Matcher::int_in_long = false;
1451 
1452 // Return whether or not this register is ever used as an argument.  This
1453 // function is used on startup to build the trampoline stubs in generateOptoStub.
1454 // Registers not mentioned will be killed by the VM call in the trampoline, and
1455 // arguments in those registers not be available to the callee.
1456 bool Matcher::can_be_java_arg( int reg ) {
1457   if(  reg == ECX_num   || reg == EDX_num   ) return true;
1458   if( (reg == XMM0a_num || reg == XMM1a_num) && UseSSE>=1 ) return true;
1459   if( (reg == XMM0b_num || reg == XMM1b_num) && UseSSE>=2 ) return true;
1460   return false;
1461 }
1462 
// Any register that can carry a Java argument may also be spilled.
bool Matcher::is_spillable_arg( int reg ) {
  return can_be_java_arg(reg);
}
1466 
// Register for DIVI projection of divmodI
// (x86 IDIV leaves the quotient in EAX)
RegMask Matcher::divI_proj_mask() {
  return EAX_REG_mask;
}

// Register for MODI projection of divmodI
// (x86 IDIV leaves the remainder in EDX)
RegMask Matcher::modI_proj_mask() {
  return EDX_REG_mask;
}

// Register for DIVL projection of divmodL
// (not reachable on 32-bit x86 -- long div/mod is not matched as divmodL)
RegMask Matcher::divL_proj_mask() {
  ShouldNotReachHere();
  return RegMask();
}

// Register for MODL projection of divmodL
// (not reachable on 32-bit x86 -- long div/mod is not matched as divmodL)
RegMask Matcher::modL_proj_mask() {
  ShouldNotReachHere();
  return RegMask();
}

// Register preserved across a method-handle invoke for restoring SP.
const RegMask Matcher::method_handle_invoke_SP_save_mask() {
  return EBP_REG_mask;
}
1492 
1493 // Returns true if the high 32 bits of the left (when isLeft==true) or the
1494 // right (when isLeft==false) operand is known to be zero.
1495 bool is_mulL_operand_hi32_zero(MulLNode* n, bool isLeft) {
1496   Node* o = isLeft ? n->in(1) : n->in(2);
1497   if (o->Opcode() == Op_LoadUI2L) {
1498     return true;
1499   }
1500   if (o->Opcode() == Op_AndL) {
1501     Node* o2 = o->in(2);
1502     if (o2->is_Con() && (o2->get_long() & 0xFFFFFFFF00000000LL) == 0LL) {
1503       return true;
1504     }
1505   }
1506   return false;
1507 }
1508 
1509 %}
1510 
1511 //----------ENCODING BLOCK-----------------------------------------------------
1512 // This block specifies the encoding classes used by the compiler to output
1513 // byte streams.  Encoding classes generate functions which are called by
1514 // Machine Instruction Nodes in order to generate the bit encoding of the
1515 // instruction.  Operands specify their base encoding interface with the
// interface keyword.  There are currently four supported interfaces,
1517 // REG_INTER, CONST_INTER, MEMORY_INTER, & COND_INTER.  REG_INTER causes an
1518 // operand to generate a function which returns its register number when
1519 // queried.   CONST_INTER causes an operand to generate a function which
1520 // returns the value of the constant when queried.  MEMORY_INTER causes an
1521 // operand to generate four functions which return the Base Register, the
1522 // Index Register, the Scale Value, and the Offset Value of the operand when
1523 // queried.  COND_INTER causes an operand to generate six functions which
// return the encoding code (i.e. the encoding bits for the instruction)
1525 // associated with each basic boolean condition for a conditional instruction.
1526 // Instructions specify two basic values for encoding.  They use the
1527 // ins_encode keyword to specify their encoding class (which must be one of
1528 // the class names specified in the encoding block), and they use the
1529 // opcode keyword to specify, in order, their primary, secondary, and
1530 // tertiary opcode.  Only the opcode sections which a particular instruction
1531 // needs for encoding need to be specified.
1532 encode %{
1533   // Build emit functions for each basic byte or larger field in the intel
1534   // encoding scheme (opcode, rm, sib, immediate), and call them from C++
1535   // code in the enc_class source block.  Emit functions will live in the
1536   // main source block for now.  In future, we can generalize this by
1537   // adding a syntax that specifies the sizes of fields in an order,
1538   // so that the adlc can build the emit functions automagically
1539 
  // Emit primary opcode
  enc_class OpcP %{
    emit_opcode(cbuf, $primary);
  %}

  // Emit secondary opcode
  enc_class OpcS %{
    emit_opcode(cbuf, $secondary);
  %}

  // Emit opcode directly
  enc_class Opcode(immI d8) %{
    emit_opcode(cbuf, $d8$$constant);
  %}

  // Emit the 0x66 operand-size override prefix (selects 16-bit operand size).
  enc_class SizePrefix %{
    emit_opcode(cbuf,0x66);
  %}
1558 
  // Register-direct ModRM byte (mod=0b11) for dst,src.
  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  // Emit an ADLC-supplied opcode constant, then a register-direct ModRM byte.
  enc_class OpcRegReg (immI opcode, eRegI dst, eRegI src) %{    // OpcRegReg(Many)
    emit_opcode(cbuf,$opcode$$constant);
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  // Load immediate zero into a 32-bit register.
  enc_class mov_r32_imm0( eRegI dst ) %{
    emit_opcode( cbuf, 0xB8 + $dst$$reg ); // 0xB8+ rd   -- MOV r32  ,imm32
    emit_d32   ( cbuf, 0x0  );             //                         imm32==0x0
  %}
1572 
  // Prologue for 32-bit idiv/irem: detect the min_int / -1 overflow case
  // required by the JVM spec and skip the idiv for it.
  enc_class cdq_enc %{
    // Full implementation of Java idiv and irem; checks for
    // special case as described in JVM spec., p.243 & p.271.
    //
    //         normal case                           special case
    //
    // input : rax,: dividend                         min_int
    //         reg: divisor                          -1
    //
    // output: rax,: quotient  (= rax, idiv reg)       min_int
    //         rdx: remainder (= rax, irem reg)       0
    //
    //  Code sequence:
    //
    //  81 F8 00 00 00 80    cmp         rax,80000000h
    //  0F 85 0B 00 00 00    jne         normal_case
    //  33 D2                xor         rdx,edx
    //  83 F9 FF             cmp         rcx,0FFh
    //  0F 84 03 00 00 00    je          done
    //                  normal_case:
    //  99                   cdq
    //  F7 F9                idiv        rax,ecx
    //                  done:
    //
    emit_opcode(cbuf,0x81); emit_d8(cbuf,0xF8);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x80);                     // cmp rax,80000000h
    emit_opcode(cbuf,0x0F); emit_d8(cbuf,0x85);
    emit_opcode(cbuf,0x0B); emit_d8(cbuf,0x00);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);                     // jne normal_case
    emit_opcode(cbuf,0x33); emit_d8(cbuf,0xD2);                     // xor rdx,edx
    emit_opcode(cbuf,0x83); emit_d8(cbuf,0xF9); emit_d8(cbuf,0xFF); // cmp rcx,0FFh (divisor == -1, imm8 sign-extended)
    emit_opcode(cbuf,0x0F); emit_d8(cbuf,0x84);
    emit_opcode(cbuf,0x03); emit_d8(cbuf,0x00);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);                     // je done
    // normal_case:
    emit_opcode(cbuf,0x99);                                         // cdq
    // idiv (note: must be emitted by the user of this rule)
    // normal:
  %}
1613 
  // Dense encoding for older common ops
  enc_class Opc_plus(immI opcode, eRegI reg) %{
    // Register number is folded into the low bits of the opcode byte.
    emit_opcode(cbuf, $opcode$$constant + $reg$$reg);
  %}


  // Opcode enc_class for 8/32 bit immediate instructions with sign-extension
  enc_class OpcSE (immI imm) %{ // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
      emit_opcode(cbuf, $primary | 0x02);
    }
    else {                          // If 32-bit immediate
      emit_opcode(cbuf, $primary);
    }
  %}

  // As OpcSE, plus the r/m byte carrying $secondary as the /r opcode field.
  enc_class OpcSErm (eRegI dst, immI imm) %{    // OpcSEr/m
    // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
      emit_opcode(cbuf, $primary | 0x02);    }
    else {                          // If 32-bit immediate
      emit_opcode(cbuf, $primary);
    }
    // Emit r/m byte with secondary opcode, after primary opcode.
    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
  %}

  // Emit only the immediate: one byte if it fits in 8 bits, else four bytes.
  enc_class Con8or32 (immI imm) %{    // Con8or32(storeImmI), 8 or 32 bits
    // Check for 8-bit immediate, and set sign extend bit in opcode
    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
      $$$emit8$imm$$constant;
    }
    else {                          // If 32-bit immediate
      // Output immediate
      $$$emit32$imm$$constant;
    }
  %}
1653 
  // Immediate ALU op on the low 32 bits of a long ($secondary is the /r field).
  enc_class Long_OpcSErm_Lo(eRegL dst, immL imm) %{
    // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    int con = (int)$imm$$constant; // Throw away top bits
    emit_opcode(cbuf, ((con >= -128) && (con <= 127)) ? ($primary | 0x02) : $primary);
    // Emit r/m byte with secondary opcode, after primary opcode.
    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
    if ((con >= -128) && (con <= 127)) emit_d8 (cbuf,con);
    else                               emit_d32(cbuf,con);
  %}

  // Immediate ALU op on the high 32 bits of a long ($tertiary is the /r field).
  enc_class Long_OpcSErm_Hi(eRegL dst, immL imm) %{
    // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    int con = (int)($imm$$constant >> 32); // Throw away bottom bits
    emit_opcode(cbuf, ((con >= -128) && (con <= 127)) ? ($primary | 0x02) : $primary);
    // Emit r/m byte with tertiary opcode, after primary opcode.
    emit_rm(cbuf, 0x3, $tertiary, HIGH_FROM_LOW($dst$$reg));
    if ((con >= -128) && (con <= 127)) emit_d8 (cbuf,con);
    else                               emit_d32(cbuf,con);
  %}
1675 
  // Emit a 32-bit pc-relative displacement to a label (0 if not yet bound).
  enc_class Lbl (label labl) %{ // JMP, CALL
    Label *l = $labl$$label;
    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.code_size()+4)) : 0);
  %}

  // Emit an 8-bit pc-relative displacement to a label; asserts it fits.
  enc_class LblShort (label labl) %{ // JMP, CALL
    Label *l = $labl$$label;
    int disp = l ? (l->loc_pos() - (cbuf.code_size()+1)) : 0;
    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
    emit_d8(cbuf, disp);
  %}

  // Emit $secondary with the register number folded into its low bits.
  enc_class OpcSReg (eRegI dst) %{    // BSWAP
    emit_cc(cbuf, $secondary, $dst$$reg );
  %}

  // Byte-swap a 64-bit value: bswap each half, then exchange the halves.
  enc_class bswap_long_bytes(eRegL dst) %{ // BSWAP
    int destlo = $dst$$reg;
    int desthi = HIGH_FROM_LOW(destlo);
    // bswap lo
    emit_opcode(cbuf, 0x0F);
    emit_cc(cbuf, 0xC8, destlo);
    // bswap hi
    emit_opcode(cbuf, 0x0F);
    emit_cc(cbuf, 0xC8, desthi);
    // xchg lo and hi
    emit_opcode(cbuf, 0x87);
    emit_rm(cbuf, 0x3, destlo, desthi);
  %}

  // Register-direct ModRM byte with $secondary as the /r opcode field.
  enc_class RegOpc (eRegI div) %{    // IDIV, IMOD, JMP indirect, ...
    emit_rm(cbuf, 0x3, $secondary, $div$$reg );
  %}
1708   %}
1709 
  // Long-form Jcc: $primary prefix byte, $secondary+condition, then rel32.
  enc_class Jcc (cmpOp cop, label labl) %{    // JCC
    Label *l = $labl$$label;
    $$$emit8$primary;
    emit_cc(cbuf, $secondary, $cop$$cmpcode);
    emit_d32(cbuf, l ? (l->loc_pos() - (cbuf.code_size()+4)) : 0);
  %}

  // Short-form Jcc: $primary+condition, then rel8 (asserted to fit).
  enc_class JccShort (cmpOp cop, label labl) %{    // JCC
    Label *l = $labl$$label;
    emit_cc(cbuf, $primary, $cop$$cmpcode);
    int disp = l ? (l->loc_pos() - (cbuf.code_size()+1)) : 0;
    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
    emit_d8(cbuf, disp);
  %}

  // CMOVcc opcode bytes: $primary prefix, then $secondary+condition.
  enc_class enc_cmov(cmpOp cop ) %{ // CMOV
    $$$emit8$primary;
    emit_cc(cbuf, $secondary, $cop$$cmpcode);
  %}

  // x87 FCMOVcc: two-byte opcode built from the condition code and the
  // source slot (the -1 adjusts the FPU stack-register encoding).
  enc_class enc_cmov_d(cmpOp cop, regD src ) %{ // CMOV
    int op = 0xDA00 + $cop$$cmpcode + ($src$$reg-1);
    emit_d8(cbuf, op >> 8 );
    emit_d8(cbuf, op & 255);
  %}

  // emulate a CMOV with a conditional branch around a MOV
  enc_class enc_cmov_branch( cmpOp cop, immI brOffs ) %{ // CMOV
    // Invert sense of branch from sense of CMOV
    emit_cc( cbuf, 0x70, ($cop$$cmpcode^1) );
    emit_d8( cbuf, $brOffs$$constant );
  %}
1742 
  // Slow-path subtype check: ESI=subklass, EAX=superklass, ECX is killed,
  // EDI receives the result.  When $primary is set, EDI is zeroed on the
  // fall-through (match) path before the miss label is bound.
  enc_class enc_PartialSubtypeCheck( ) %{
    Register Redi = as_Register(EDI_enc); // result register
    Register Reax = as_Register(EAX_enc); // super class
    Register Recx = as_Register(ECX_enc); // killed
    Register Resi = as_Register(ESI_enc); // sub class
    Label miss;

    MacroAssembler _masm(&cbuf);
    __ check_klass_subtype_slow_path(Resi, Reax, Recx, Redi,
                                     NULL, &miss,
                                     /*set_cond_codes:*/ true);
    if ($primary) {
      __ xorptr(Redi, Redi);
    }
    __ bind(miss);
  %}
1759 
  // Ensure the x87 FPU stack is empty before a C call; records the size of
  // this sequence the first time and verifies it on every later emission.
  enc_class FFree_Float_Stack_All %{    // Free_Float_Stack_All
    MacroAssembler masm(&cbuf);
    int start = masm.offset();
    if (UseSSE >= 2) {
      if (VerifyFPU) {
        masm.verify_FPU(0, "must be empty in SSE2+ mode");
      }
    } else {
      // External c_calling_convention expects the FPU stack to be 'clean'.
      // Compiled code leaves it dirty.  Do cleanup now.
      masm.empty_FPU_stack();
    }
    if (sizeof_FFree_Float_Stack_All == -1) {
      // First emission: remember the size so later emissions can be checked.
      sizeof_FFree_Float_Stack_All = masm.offset() - start;
    } else {
      assert(masm.offset() - start == sizeof_FFree_Float_Stack_All, "wrong size");
    }
  %}

  // Optionally verify FPU state when returning from a runtime leaf call.
  enc_class Verify_FPU_For_Leaf %{
    if( VerifyFPU ) {
      MacroAssembler masm(&cbuf);
      masm.verify_FPU( -3, "Returning from Runtime Leaf call");
    }
  %}
1785 
  enc_class Java_To_Runtime (method meth) %{    // CALL Java_To_Runtime, Java_To_Runtime_Leaf
    // This is the instruction starting address for relocation info.
    cbuf.set_inst_mark();
    $$$emit8$primary;
    // CALL directly to the runtime
    emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
                runtime_call_Relocation::spec(), RELOC_IMM32 );

    if (UseSSE >= 2) {
      MacroAssembler _masm(&cbuf);
      BasicType rt = tf()->return_type();

      if ((rt == T_FLOAT || rt == T_DOUBLE) && !return_value_is_used()) {
        // A C runtime call where the return value is unused.  In SSE2+
        // mode the result needs to be removed from the FPU stack.  It's
        // likely that this function call could be removed by the
        // optimizer if the C function is a pure function.
        __ ffree(0);
      } else if (rt == T_FLOAT) {
        // Move the x87 float result into XMM0 via a stack slot.
        __ lea(rsp, Address(rsp, -4));
        __ fstp_s(Address(rsp, 0));
        __ movflt(xmm0, Address(rsp, 0));
        __ lea(rsp, Address(rsp,  4));
      } else if (rt == T_DOUBLE) {
        // Move the x87 double result into XMM0 via a stack slot.
        __ lea(rsp, Address(rsp, -8));
        __ fstp_d(Address(rsp, 0));
        __ movdbl(xmm0, Address(rsp, 0));
        __ lea(rsp, Address(rsp,  8));
      }
    }
  %}
1817 
1818 
  // Restore the standard FPU control word before a call if this method
  // runs in 24-bit precision mode.
  enc_class pre_call_FPU %{
    // If method sets FPU control word restore it here
    debug_only(int off0 = cbuf.code_size());
    if( Compile::current()->in_24_bit_fp_mode() ) {
      MacroAssembler masm(&cbuf);
      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
    }
    debug_only(int off1 = cbuf.code_size());
    assert(off1 - off0 == pre_call_FPU_size(), "correct size prediction");
  %}

  // Re-establish 24-bit precision mode after the call, if the method uses it.
  enc_class post_call_FPU %{
    // If method sets FPU control word do it here also
    if( Compile::current()->in_24_bit_fp_mode() ) {
      MacroAssembler masm(&cbuf);
      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
    }
  %}

  enc_class preserve_SP %{
    debug_only(int off0 = cbuf.code_size());
    MacroAssembler _masm(&cbuf);
    // RBP is preserved across all calls, even compiled calls.
    // Use it to preserve RSP in places where the callee might change the SP.
    __ movptr(rbp, rsp);
    debug_only(int off1 = cbuf.code_size());
    assert(off1 - off0 == preserve_SP_size(), "correct size prediction");
  %}

  // Restore the stack pointer saved by preserve_SP.
  enc_class restore_SP %{
    MacroAssembler _masm(&cbuf);
    __ movptr(rsp, rbp);
  %}
1852 
  enc_class Java_Static_Call (method meth) %{    // JAVA STATIC CALL
    // CALL to fixup routine.  Fixup routine uses ScopeDesc info to determine
    // who we intended to call.
    cbuf.set_inst_mark();
    $$$emit8$primary;
    if ( !_method ) {
      // No Java method: a runtime call.
      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
                     runtime_call_Relocation::spec(), RELOC_IMM32 );
    } else if(_optimized_virtual) {
      // Statically-bound virtual call.
      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
                     opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
    } else {
      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
                     static_call_Relocation::spec(), RELOC_IMM32 );
    }
    if( _method ) {  // Emit stub for static call
      emit_java_to_interp(cbuf);
    }
  %}

  enc_class Java_Dynamic_Call (method meth) %{    // JAVA DYNAMIC CALL
    // !!!!!
    // Generate  "Mov EAX,0x00", placeholder instruction to load oop-info
    // emit_call_dynamic_prologue( cbuf );
    cbuf.set_inst_mark();
    emit_opcode(cbuf, 0xB8 + EAX_enc);        // mov    EAX,-1
    emit_d32_reloc(cbuf, (int)Universe::non_oop_word(), oop_Relocation::spec_for_immediate(), RELOC_IMM32);
    address  virtual_call_oop_addr = cbuf.inst_mark();
    // CALL to fixup routine.  Fixup routine uses ScopeDesc info to determine
    // who we intended to call.
    cbuf.set_inst_mark();
    $$$emit8$primary;
    emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.code_end()) - 4),
                virtual_call_Relocation::spec(virtual_call_oop_addr), RELOC_IMM32 );
  %}

  enc_class Java_Compiled_Call (method meth) %{    // JAVA COMPILED CALL
    int disp = in_bytes(methodOopDesc::from_compiled_offset());
    assert( -128 <= disp && disp <= 127, "compiled_code_offset isn't small");

    // CALL *[EAX+in_bytes(methodOopDesc::from_compiled_code_entry_point_offset())]
    cbuf.set_inst_mark();
    $$$emit8$primary;
    emit_rm(cbuf, 0x01, $secondary, EAX_enc );  // R/M byte
    emit_d8(cbuf, disp);             // Displacement

  %}
1900 
  // Clear a register: XOR dst,dst (0x33 /r).
  enc_class Xor_Reg (eRegI dst) %{
    emit_opcode(cbuf, 0x33);
    emit_rm(cbuf, 0x3, $dst$$reg, $dst$$reg);
  %}
1905 
1906 //   Following encoding is no longer used, but may be restored if calling
1907 //   convention changes significantly.
1908 //   Became: Xor_Reg(EBP), Java_To_Runtime( labl )
1909 //
1910 //   enc_class Java_Interpreter_Call (label labl) %{    // JAVA INTERPRETER CALL
1911 //     // int ic_reg     = Matcher::inline_cache_reg();
1912 //     // int ic_encode  = Matcher::_regEncode[ic_reg];
1913 //     // int imo_reg    = Matcher::interpreter_method_oop_reg();
1914 //     // int imo_encode = Matcher::_regEncode[imo_reg];
1915 //
1916 //     // // Interpreter expects method_oop in EBX, currently a callee-saved register,
1917 //     // // so we load it immediately before the call
1918 //     // emit_opcode(cbuf, 0x8B);                     // MOV    imo_reg,ic_reg  # method_oop
1919 //     // emit_rm(cbuf, 0x03, imo_encode, ic_encode ); // R/M byte
1920 //
1921 //     // xor rbp,ebp
1922 //     emit_opcode(cbuf, 0x33);
1923 //     emit_rm(cbuf, 0x3, EBP_enc, EBP_enc);
1924 //
1925 //     // CALL to interpreter.
1926 //     cbuf.set_inst_mark();
1927 //     $$$emit8$primary;
1928 //     emit_d32_reloc(cbuf, ($labl$$label - (int)(cbuf.code_end()) - 4),
1929 //                 runtime_call_Relocation::spec(), RELOC_IMM32 );
1930 //   %}
1931 
  // Shift a register by an 8-bit immediate ($secondary is the /r opcode field).
  enc_class RegOpcImm (eRegI dst, immI8 shift) %{    // SHL, SAR, SHR
    $$$emit8$primary;
    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
    $$$emit8$shift$$constant;
  %}

  enc_class LdImmI (eRegI dst, immI src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    emit_opcode(cbuf, 0xB8 + $dst$$reg);
    $$$emit32$src$$constant;
  %}

  enc_class LdImmP (eRegI dst, immI src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    emit_opcode(cbuf, $primary + $dst$$reg);
    $$$emit32$src$$constant;
  %}

  // Load the low 32 bits of a long immediate.
  enc_class LdImmL_Lo( eRegL dst, immL src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    int dst_enc = $dst$$reg;
    int src_con = $src$$constant & 0x0FFFFFFFFL;
    if (src_con == 0) {
      // Zero loads more compactly as XOR dst,dst.
      // xor dst, dst
      emit_opcode(cbuf, 0x33);
      emit_rm(cbuf, 0x3, dst_enc, dst_enc);
    } else {
      emit_opcode(cbuf, $primary + dst_enc);
      emit_d32(cbuf, src_con);
    }
  %}

  // Load the high 32 bits of a long immediate (high half encodes at +2).
  enc_class LdImmL_Hi( eRegL dst, immL src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    int dst_enc = $dst$$reg + 2;
    int src_con = ((julong)($src$$constant)) >> 32;
    if (src_con == 0) {
      // Zero loads more compactly as XOR dst,dst.
      // xor dst, dst
      emit_opcode(cbuf, 0x33);
      emit_rm(cbuf, 0x3, dst_enc, dst_enc);
    } else {
      emit_opcode(cbuf, $primary + dst_enc);
      emit_d32(cbuf, src_con);
    }
  %}
1981 
1982 
  // Load a double constant onto the x87 stack; uses the FLDZ/FLD1 shortcuts
  // for +0.0 and +1.0, else FLD m64 of an emitted constant.
  enc_class LdImmD (immD src) %{    // Load Immediate
    if( is_positive_zero_double($src$$constant)) {
      // FLDZ
      emit_opcode(cbuf,0xD9);
      emit_opcode(cbuf,0xEE);
    } else if( is_positive_one_double($src$$constant)) {
      // FLD1
      emit_opcode(cbuf,0xD9);
      emit_opcode(cbuf,0xE8);
    } else {
      emit_opcode(cbuf,0xDD);
      emit_rm(cbuf, 0x0, 0x0, 0x5);
      emit_double_constant(cbuf, $src$$constant);
    }
  %}


  // Load a float constant onto the x87 stack; same shortcuts as LdImmD.
  enc_class LdImmF (immF src) %{    // Load Immediate
    if( is_positive_zero_float($src$$constant)) {
      // FLDZ
      emit_opcode(cbuf,0xD9);
      emit_opcode(cbuf,0xEE);
    } else if( is_positive_one_float($src$$constant)) {
      // FLD1
      emit_opcode(cbuf,0xD9);
      emit_opcode(cbuf,0xE8);
    } else {
      $$$emit8$primary;
      // Load immediate does not have a zero or sign extended version
      // for 8-bit immediates
      // First load to TOS, then move to dst
      emit_rm(cbuf, 0x0, 0x0, 0x5);
      emit_float_constant(cbuf, $src$$constant);
    }
  %}

  // Load a float constant into an XMM register (mod=00, r/m=101: disp32).
  enc_class LdImmX (regX dst, immXF con) %{    // Load Immediate
    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
    emit_float_constant(cbuf, $con$$constant);
  %}

  // Load a double constant into an XMM register (mod=00, r/m=101: disp32).
  enc_class LdImmXD (regXD dst, immXD con) %{    // Load Immediate
    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
    emit_double_constant(cbuf, $con$$constant);
  %}

  enc_class load_conXD (regXD dst, immXD con) %{ // Load double constant
    // UseXmmLoadAndClearUpper ? movsd(dst, con) : movlpd(dst, con)
    emit_opcode(cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
    emit_opcode(cbuf, 0x0F);
    emit_opcode(cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12);
    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);
    emit_double_constant(cbuf, $con$$constant);
  %}

  // Emit $primary, a ModRM with $secondary as /r and disp32 addressing,
  // then the float constant.
  enc_class Opc_MemImm_F(immF src) %{
    cbuf.set_inst_mark();
    $$$emit8$primary;
    emit_rm(cbuf, 0x0, $secondary, 0x5);
    emit_float_constant(cbuf, $src$$constant);
  %}
2042 
2043 
  // MOVD xmm,r32: copy an int GPR into an XMM register.
  enc_class MovI2X_reg(regX dst, eRegI src) %{
    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
    emit_opcode(cbuf, 0x0F );
    emit_opcode(cbuf, 0x6E );
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  // MOVD r32,xmm: copy the low 32 bits of an XMM register into a GPR.
  enc_class MovX2I_reg(eRegI dst, regX src) %{
    emit_opcode(cbuf, 0x66 );     // MOVD dst,src
    emit_opcode(cbuf, 0x0F );
    emit_opcode(cbuf, 0x7E );
    emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
  %}

  // Assemble a long register pair into one XMM register:
  // MOVD dst,src.lo; MOVD tmp,src.hi; PUNPCKLDQ dst,tmp.
  enc_class MovL2XD_reg(regXD dst, eRegL src, regXD tmp) %{
    { // MOVD $dst,$src.lo
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x6E);
      emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
    }
    { // MOVD $tmp,$src.hi
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x6E);
      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg));
    }
    { // PUNPCKLDQ $dst,$tmp
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x62);
      emit_rm(cbuf, 0x3, $dst$$reg, $tmp$$reg);
     }
  %}

  // Split an XMM register into a long register pair: MOVD dst.lo,src;
  // shuffle the high dword down into tmp; MOVD dst.hi,tmp.
  enc_class MovXD2L_reg(eRegL dst, regXD src, regXD tmp) %{
    { // MOVD $dst.lo,$src
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x7E);
      emit_rm(cbuf, 0x3, $src$$reg, $dst$$reg);
    }
    { // PSHUFLW $tmp,$src,0x4E  (01001110b)
      emit_opcode(cbuf,0xF2);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x70);
      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
      emit_d8(cbuf, 0x4E);
    }
    { // MOVD $dst.hi,$tmp
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x7E);
      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
    }
  %}
2100 
2101 
  // Encode a reg-reg copy.  If it is useless, then empty encoding.
  enc_class enc_Copy( eRegI dst, eRegI src ) %{
    encode_Copy( cbuf, $dst$$reg, $src$$reg );
  %}

  // Copy the low word of a long into an int register.
  enc_class enc_CopyL_Lo( eRegI dst, eRegL src ) %{
    encode_Copy( cbuf, $dst$$reg, $src$$reg );
  %}

  // Encode xmm reg-reg copy.  If it is useless, then empty encoding.
  enc_class enc_CopyXD( RegXD dst, RegXD src ) %{
    encode_CopyXD( cbuf, $dst$$reg, $src$$reg );
  %}

  // Register-direct ModRM byte for dst,src.
  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  // Low-word long op: $primary opcode, then ModRM of the low halves.
  enc_class RegReg_Lo(eRegL dst, eRegL src) %{    // RegReg(Many)
    $$$emit8$primary;
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  // High-word long op: $secondary opcode, then ModRM of the high halves.
  enc_class RegReg_Hi(eRegL dst, eRegL src) %{    // RegReg(Many)
    $$$emit8$secondary;
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
  %}

  // Low-word ModRM only (opcode supplied elsewhere).
  enc_class RegReg_Lo2(eRegL dst, eRegL src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  // High-word ModRM only (opcode supplied elsewhere).
  enc_class RegReg_Hi2(eRegL dst, eRegL src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
  %}

  // ModRM pairing an int register with the high half of a long.
  enc_class RegReg_HiLo( eRegL src, eRegI dst ) %{
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($src$$reg));
  %}

  enc_class Con32 (immI src) %{    // Con32(storeImmI)
    // Output immediate
    $$$emit32$src$$constant;
  %}
2146 
  enc_class Con32F_as_bits(immF src) %{        // storeF_imm
    // Output Float immediate bits
    jfloat jf = $src$$constant;
    int    jf_as_bits = jint_cast( jf );
    emit_d32(cbuf, jf_as_bits);
  %}

  enc_class Con32XF_as_bits(immXF src) %{      // storeX_imm
    // Output Float immediate bits
    jfloat jf = $src$$constant;
    int    jf_as_bits = jint_cast( jf );
    emit_d32(cbuf, jf_as_bits);
  %}

  enc_class Con16 (immI src) %{    // Con16(storeImmI)
    // Output immediate
    $$$emit16$src$$constant;
  %}

  // Emit the 32-bit immediate directly.
  enc_class Con_d32(immI src) %{
    emit_d32(cbuf,$src$$constant);
  %}

  // Absolute (disp32) memory reference with a zero displacement placeholder.
  enc_class conmemref (eRegP t1) %{    // Con32(storeImmI)
    // Output immediate memory reference
    emit_rm(cbuf, 0x00, $t1$$reg, 0x05 );
    emit_d32(cbuf, 0x00);
  %}

  // Emit the LOCK prefix, but only on multiprocessor systems.
  enc_class lock_prefix( ) %{
    if( os::is_MP() )
      emit_opcode(cbuf,0xF0);         // [Lock]
  %}
2180 
  // Cmp-xchg long value.
  // Note: we need to swap rbx, and rcx before and after the
  //       cmpxchg8 instruction because the instruction uses
  //       rcx as the high order word of the new value to store but
  //       our register encoding uses rbx,.
  enc_class enc_cmpxchg8(eSIRegP mem_ptr) %{

    // XCHG  rbx,ecx
    emit_opcode(cbuf,0x87);
    emit_opcode(cbuf,0xD9);
    // [Lock]
    if( os::is_MP() )
      emit_opcode(cbuf,0xF0);
    // CMPXCHG8 [Eptr]
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xC7);
    emit_rm( cbuf, 0x0, 1, $mem_ptr$$reg );
    // XCHG  rbx,ecx
    emit_opcode(cbuf,0x87);
    emit_opcode(cbuf,0xD9);
  %}

  // Locked compare-and-exchange of a 32-bit value at [mem_ptr].
  enc_class enc_cmpxchg(eSIRegP mem_ptr) %{
    // [Lock]
    if( os::is_MP() )
      emit_opcode(cbuf,0xF0);

    // CMPXCHG [Eptr]
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xB1);
    emit_rm( cbuf, 0x0, 1, $mem_ptr$$reg );
  %}

  // Materialize the equal condition as a 0/1 boolean in res:
  // res = 0; JNE skips the following MOV res,1.
  enc_class enc_flags_ne_to_boolean( iRegI res ) %{
    int res_encoding = $res$$reg;

    // MOV  res,0
    emit_opcode( cbuf, 0xB8 + res_encoding);
    emit_d32( cbuf, 0 );
    // JNE,s  fail
    emit_opcode(cbuf,0x75);
    emit_d8(cbuf, 5 );
    // MOV  res,1
    emit_opcode( cbuf, 0xB8 + res_encoding);
    emit_d32( cbuf, 1 );
    // fail:
  %}

  enc_class set_instruction_start( ) %{
    cbuf.set_inst_mark();            // Mark start of opcode for reloc info in mem operand
  %}
2232 
  // Standard reg/mem operand encoding; the displacement may carry an oop.
  enc_class RegMem (eRegI ereg, memory mem) %{    // emit_reg_mem
    int reg_encoding = $ereg$$reg;
    int base  = $mem$$base;
    int index = $mem$$index;
    int scale = $mem$$scale;
    int displace = $mem$$disp;
    bool disp_is_oop = $mem->disp_is_oop();
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  %}

  // Like RegMem but addresses the high word of a long in memory (disp+4).
  enc_class RegMem_Hi(eRegL ereg, memory mem) %{    // emit_reg_mem
    int reg_encoding = HIGH_FROM_LOW($ereg$$reg);  // Hi register of pair, computed from lo
    int base  = $mem$$base;
    int index = $mem$$index;
    int scale = $mem$$scale;
    int displace = $mem$$disp + 4;      // Offset is 4 further in memory
    assert( !$mem->disp_is_oop(), "Cannot add 4 to oop" );
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, false/*disp_is_oop*/);
  %}

  // Shift a long by 1-31 bits: a double shift (0x0F $tertiary) carries bits
  // between the halves, then $primary/$secondary shifts the remaining half.
  // $tertiary == 0xA4 selects the left-shift operand order.
  enc_class move_long_small_shift( eRegL dst, immI_1_31 cnt ) %{
    int r1, r2;
    if( $tertiary == 0xA4 ) { r1 = $dst$$reg;  r2 = HIGH_FROM_LOW($dst$$reg); }
    else                    { r2 = $dst$$reg;  r1 = HIGH_FROM_LOW($dst$$reg); }
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,$tertiary);
    emit_rm(cbuf, 0x3, r1, r2);
    emit_d8(cbuf,$cnt$$constant);
    emit_d8(cbuf,$primary);
    emit_rm(cbuf, 0x3, $secondary, r1);
    emit_d8(cbuf,$cnt$$constant);
  %}
2265 
  // Arithmetic right shift of a long by 32-63 bits: move hi into lo,
  // shift lo by (cnt-32), then shift hi by 31 to replicate the sign bit.
  enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{
    emit_opcode( cbuf, 0x8B ); // Move
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
    emit_d8(cbuf,$primary);
    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
    emit_d8(cbuf,$cnt$$constant-32);
    emit_d8(cbuf,$primary);
    emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg));
    emit_d8(cbuf,31);
  %}

  // Logical long shift by 32-63 bits: move one half into the other, shift it
  // by (cnt-32) when non-zero, then clear the vacated half with XOR.
  enc_class move_long_big_shift_clr( eRegL dst, immI_32_63 cnt ) %{
    int r1, r2;
    if( $secondary == 0x5 ) { r1 = $dst$$reg;  r2 = HIGH_FROM_LOW($dst$$reg); }
    else                    { r2 = $dst$$reg;  r1 = HIGH_FROM_LOW($dst$$reg); }

    emit_opcode( cbuf, 0x8B ); // Move r1,r2
    emit_rm(cbuf, 0x3, r1, r2);
    if( $cnt$$constant > 32 ) { // Shift, if not by zero
      emit_opcode(cbuf,$primary);
      emit_rm(cbuf, 0x3, $secondary, r1);
      emit_d8(cbuf,$cnt$$constant-32);
    }
    emit_opcode(cbuf,0x33);  // XOR r2,r2
    emit_rm(cbuf, 0x3, r2, r2);
  %}
2292 
  // Clone of RegMem but accepts an extra parameter to access each
  // half of a double in memory; it never needs relocation info.
  // disp_for_half is 0 for the low word, 4 for the high word.
  enc_class Mov_MemD_half_to_Reg (immI opcode, memory mem, immI disp_for_half, eRegI rm_reg) %{
    emit_opcode(cbuf,$opcode$$constant);           // caller-supplied opcode byte
    int reg_encoding = $rm_reg$$reg;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp + $disp_for_half$$constant;  // offset to the selected half
    bool disp_is_oop = false;                      // never an oop; no reloc info needed
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  %}
2305 
2306   // !!!!! Special Custom Code used by MemMove, and stack access instructions !!!!!
2307   //
2308   // Clone of RegMem except the RM-byte's reg/opcode field is an ADLC-time constant
2309   // and it never needs relocation information.
2310   // Frequently used to move data between FPU's Stack Top and memory.
  // Emit the ModRM/SIB/displacement for a memory operand whose reg/opcode
  // field is the ADLC-time constant rm_opcode.  Asserts the displacement is
  // not an oop, so no relocation info is ever required.
  enc_class RMopc_Mem_no_oop (immI rm_opcode, memory mem) %{
    int rm_byte_opcode = $rm_opcode$$constant;  // /digit for the ModRM reg field
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    assert( !$mem->disp_is_oop(), "No oops here because no relo info allowed" );
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, false);
  %}
2320 
  // Same as RMopc_Mem_no_oop but the displacement may be an oop (e.g. a
  // static global), in which case encode_RegMem records relocation info.
  enc_class RMopc_Mem (immI rm_opcode, memory mem) %{
    int rm_byte_opcode = $rm_opcode$$constant;  // /digit for the ModRM reg field
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
  %}
2330 
  // Emit the operand bytes of LEA dst,[src0+src1]: register base plus a
  // constant displacement, no index, no scale.  The LEA opcode itself is
  // emitted by the instruct's $primary encoding.
  enc_class RegLea (eRegI dst, eRegI src0, immI src1 ) %{    // emit_reg_lea
    int reg_encoding = $dst$$reg;
    int base         = $src0$$reg;      // 0xFFFFFFFF indicates no base
    int index        = 0x04;            // 0x04 indicates no index
    int scale        = 0x00;            // 0x00 indicates no scale
    int displace     = $src1$$constant; // 0x00 indicates no displacement
    bool disp_is_oop = false;           // constant displacement, never an oop
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  %}
2340 
  // dst = min(dst, src) via compare + conditional 2-byte hop over the move.
  enc_class min_enc (eRegI dst, eRegI src) %{    // MIN
    // Compare dst,src
    emit_opcode(cbuf,0x3B);                      // CMP dst,src
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
    // jmp dst < src around move
    emit_opcode(cbuf,0x7C);                      // JL,s +2: dst already the min
    emit_d8(cbuf,2);                             // skip the 2-byte MOV below
    // move dst,src
    emit_opcode(cbuf,0x8B);                      // MOV dst,src
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}
2352 
  // dst = max(dst, src); mirror image of min_enc using JG instead of JL.
  enc_class max_enc (eRegI dst, eRegI src) %{    // MAX
    // Compare dst,src
    emit_opcode(cbuf,0x3B);                      // CMP dst,src
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
    // jmp dst > src around move
    emit_opcode(cbuf,0x7F);                      // JG,s +2: dst already the max
    emit_d8(cbuf,2);                             // skip the 2-byte MOV below
    // move dst,src
    emit_opcode(cbuf,0x8B);                      // MOV dst,src
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}
2364 
  // Store an x87 register to memory.  $primary supplies the store opcode.
  enc_class enc_FP_store(memory mem, regD src) %{
    // If src is FPR1, we can just FST to store it.
    // Else we need to FLD it to FPR1, then FSTP to store/pop it.
    int reg_encoding = 0x2; // /2 = FST: just store, FPU stack unchanged
    int base  = $mem$$base;
    int index = $mem$$index;
    int scale = $mem$$scale;
    int displace = $mem$$disp;
    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
    if( $src$$reg != FPR1L_enc ) {
      reg_encoding = 0x3;  // /3 = FSTP: store & pop the copy we push below
      emit_opcode( cbuf, 0xD9 ); // FLD ST(i) (i.e., push src to top of stack)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
    }
    cbuf.set_inst_mark();       // Mark start of opcode for reloc info in mem operand
    emit_opcode(cbuf,$primary);
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
  %}
2383 
  // Two's-complement negate: NEG is opcode 0xF7 with ModRM /3.
  enc_class neg_reg(eRegI dst) %{
    // NEG $dst
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0x03, $dst$$reg );
  %}
2389 
  // SETL r/m8 (0x0F 0x9C): set the low byte of dst to 1 if signed-less-than,
  // else 0.  dst is constrained to ECX so a byte register encoding exists.
  enc_class setLT_reg(eCXRegI dst) %{
    // SETLT $dst
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0x9C);
    emit_rm( cbuf, 0x3, 0x4, $dst$$reg );
  %}
2396 
  // Branchless conditional add (cadd_cmpLT): p -= q, then SBB materializes
  // the borrow as an all-ones/all-zeros mask in tmp, which gates adding y.
  // Net effect: p = p - q, plus y when the subtraction borrowed.
  enc_class enc_cmpLTP(ncxRegI p, ncxRegI q, ncxRegI y, eCXRegI tmp) %{    // cadd_cmpLT
    int tmpReg = $tmp$$reg;

    // SUB $p,$q
    emit_opcode(cbuf,0x2B);
    emit_rm(cbuf, 0x3, $p$$reg, $q$$reg);
    // SBB $tmp,$tmp   - tmp = 0 or -1 depending on the carry from SUB
    emit_opcode(cbuf,0x1B);
    emit_rm(cbuf, 0x3, tmpReg, tmpReg);
    // AND $tmp,$y     - tmp = y or 0
    emit_opcode(cbuf,0x23);
    emit_rm(cbuf, 0x3, tmpReg, $y$$reg);
    // ADD $p,$tmp     - conditionally add y
    emit_opcode(cbuf,0x03);
    emit_rm(cbuf, 0x3, $p$$reg, tmpReg);
  %}
2413 
  // Same branchless conditional add as enc_cmpLTP, but the gated addend
  // comes from memory; the AND's memory operand may need relocation info.
  enc_class enc_cmpLTP_mem(eRegI p, eRegI q, memory mem, eCXRegI tmp) %{    // cadd_cmpLT
    int tmpReg = $tmp$$reg;

    // SUB $p,$q
    emit_opcode(cbuf,0x2B);
    emit_rm(cbuf, 0x3, $p$$reg, $q$$reg);
    // SBB $tmp,$tmp   - tmp = 0 or -1 depending on the carry from SUB
    emit_opcode(cbuf,0x1B);
    emit_rm(cbuf, 0x3, tmpReg, tmpReg);
    // AND $tmp,$y     - y loaded from memory here
    cbuf.set_inst_mark();       // Mark start of opcode for reloc info in mem operand
    emit_opcode(cbuf,0x23);
    int reg_encoding = tmpReg;
    int base  = $mem$$base;
    int index = $mem$$index;
    int scale = $mem$$scale;
    int displace = $mem$$disp;
    bool disp_is_oop = $mem->disp_is_oop();
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
    // ADD $p,$tmp     - conditionally add the masked value
    emit_opcode(cbuf,0x03);
    emit_rm(cbuf, 0x3, $p$$reg, tmpReg);
  %}
2437 
  // 64-bit left shift by a variable count in ECX.  If bit 5 of the count is
  // set (count >= 32) the low half is pre-moved into the high half and
  // cleared; then SHLD/SHL handle the remaining count mod 32.
  enc_class shift_left_long( eRegL dst, eCXRegI shift ) %{
    // TEST shift,32 - is the count >= 32?
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0, ECX_enc);
    emit_d32(cbuf,0x20);
    // JEQ,s small - skip the 4-byte MOV+CLR pair below
    emit_opcode(cbuf, 0x74);
    emit_d8(cbuf, 0x04);
    // MOV    $dst.hi,$dst.lo
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg );
    // CLR    $dst.lo
    emit_opcode(cbuf, 0x33);
    emit_rm(cbuf, 0x3, $dst$$reg, $dst$$reg);
// small:
    // SHLD   $dst.hi,$dst.lo,$shift  (shift count taken mod 32 by hardware)
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xA5);
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
    // SHL    $dst.lo,$shift"
    emit_opcode(cbuf,0xD3);
    emit_rm(cbuf, 0x3, 0x4, $dst$$reg );
  %}
2461 
  // 64-bit logical right shift by a variable count in ECX; mirror image of
  // shift_left_long (high half pre-moved down and cleared when count >= 32).
  enc_class shift_right_long( eRegL dst, eCXRegI shift ) %{
    // TEST shift,32 - is the count >= 32?
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0, ECX_enc);
    emit_d32(cbuf,0x20);
    // JEQ,s small - skip the 4-byte MOV+CLR pair below
    emit_opcode(cbuf, 0x74);
    emit_d8(cbuf, 0x04);
    // MOV    $dst.lo,$dst.hi
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg) );
    // CLR    $dst.hi
    emit_opcode(cbuf, 0x33);
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($dst$$reg));
// small:
    // SHRD   $dst.lo,$dst.hi,$shift  (shift count taken mod 32 by hardware)
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xAD);
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg);
    // SHR    $dst.hi,$shift"
    emit_opcode(cbuf,0xD3);
    emit_rm(cbuf, 0x3, 0x5, HIGH_FROM_LOW($dst$$reg) );
  %}
2485 
  // 64-bit arithmetic right shift by a variable count in ECX.  Like
  // shift_right_long, but when count >= 32 the high half is refilled with
  // the sign (SAR 31) instead of being cleared, and SAR replaces SHR.
  enc_class shift_right_arith_long( eRegL dst, eCXRegI shift ) %{
    // TEST shift,32 - is the count >= 32?
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0, ECX_enc);
    emit_d32(cbuf,0x20);
    // JEQ,s small - skip the 5-byte MOV+SAR pair below
    emit_opcode(cbuf, 0x74);
    emit_d8(cbuf, 0x05);
    // MOV    $dst.lo,$dst.hi
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg) );
    // SAR    $dst.hi,31 - sign-fill the high half
    emit_opcode(cbuf, 0xC1);
    emit_rm(cbuf, 0x3, 7, HIGH_FROM_LOW($dst$$reg) );
    emit_d8(cbuf, 0x1F );
// small:
    // SHRD   $dst.lo,$dst.hi,$shift  (shift count taken mod 32 by hardware)
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xAD);
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg);
    // SAR    $dst.hi,$shift"
    emit_opcode(cbuf,0xD3);
    emit_rm(cbuf, 0x3, 0x7, HIGH_FROM_LOW($dst$$reg) );
  %}
2510 
2511 
2512   // ----------------- Encodings for floating point unit -----------------
2513   // May leave result in FPU-TOS or FPU reg depending on opcodes
  // Generic x87 reg-form arithmetic op: $primary opcode byte followed by a
  // register ModRM with $secondary as the /digit (e.g. FMUL, FDIV).
  enc_class OpcReg_F (regF src) %{    // FMUL, FDIV
    $$$emit8$primary;
    emit_rm(cbuf, 0x3, $secondary, $src$$reg );
  %}
2518 
  // Pop argument in FPR0 with FSTP ST(0) - discards the top of the x87 stack.
  enc_class PopFPU() %{
    emit_opcode( cbuf, 0xDD );  // FSTP ST(0)
    emit_d8( cbuf, 0xD8 );
  %}
2524 
  // !!!!! equivalent to Pop_Reg_F
  // Store the x87 top-of-stack into ST(i) and pop.
  enc_class Pop_Reg_D( regD dst ) %{
    emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
    emit_d8( cbuf, 0xD8+$dst$$reg );
  %}
2530 
  // Push a copy of x87 register dst onto the FPU stack.
  enc_class Push_Reg_D( regD dst ) %{
    emit_opcode( cbuf, 0xD9 );
    emit_d8( cbuf, 0xC0-1+$dst$$reg );   // FLD ST(i-1)
  %}
2535 
  // Strictfp scaling: multiply dst by the first subnormal-bias constant
  // (an 80-bit real loaded from a StubRoutines address) to narrow the
  // exponent range before a strict FP operation.
  enc_class strictfp_bias1( regD dst ) %{
    emit_opcode( cbuf, 0xDB );           // FLD m80real
    emit_opcode( cbuf, 0x2D );           // ModRM: /5, disp32 absolute address
    emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias1() );
    emit_opcode( cbuf, 0xDE );           // FMULP ST(dst), ST0
    emit_opcode( cbuf, 0xC8+$dst$$reg );
  %}
2543 
  // Strictfp scaling: multiply dst by the second subnormal-bias constant,
  // undoing the adjustment applied by strictfp_bias1.
  enc_class strictfp_bias2( regD dst ) %{
    emit_opcode( cbuf, 0xDB );           // FLD m80real
    emit_opcode( cbuf, 0x2D );           // ModRM: /5, disp32 absolute address
    emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias2() );
    emit_opcode( cbuf, 0xDE );           // FMULP ST(dst), ST0
    emit_opcode( cbuf, 0xC8+$dst$$reg );
  %}
2551 
  // Special case for moving an integer register to a stack slot.
  // $primary supplies the store opcode; src's encoding fills the reg field.
  enc_class OpcPRegSS( stackSlotI dst, eRegI src ) %{ // RegSS
    store_to_stackslot( cbuf, $primary, $src$$reg, $dst$$disp );
  %}
2556 
  // Special case for moving a register to a stack slot.
  // Emits only the [ESP+disp32] operand bytes (ModRM + SIB + disp);
  // the opcode byte must already have been emitted by the caller.
  enc_class RegSS( stackSlotI dst, eRegI src ) %{ // RegSS
    // Opcode already emitted
    emit_rm( cbuf, 0x02, $src$$reg, ESP_enc );   // R/M byte: mod=10 (disp32), base via SIB
    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);          // SIB byte: no index, base=ESP
    emit_d32(cbuf, $dst$$disp);   // Displacement
  %}
2564 
  // Push the integer in stackSlot 'src' onto FP-stack.
  // $primary is the FILD opcode byte, $secondary its /digit.
  enc_class Push_Mem_I( memory src ) %{    // FILD   [ESP+src]
    store_to_stackslot( cbuf, $primary, $secondary, $src$$disp );
  %}
2569 
  // Push the float in stackSlot 'src' onto FP-stack (0xD9 /0 = FLD m32real).
  enc_class Push_Mem_F( memory src ) %{    // FLD_S   [ESP+src]
    store_to_stackslot( cbuf, 0xD9, 0x00, $src$$disp );
  %}
2574 
  // Push the double in stackSlot 'src' onto FP-stack (0xDD /0 = FLD m64real).
  enc_class Push_Mem_D( memory src ) %{    // FLD_D   [ESP+src]
    store_to_stackslot( cbuf, 0xDD, 0x00, $src$$disp );
  %}
2579 
  // Push FPU's TOS float to a stack-slot, and pop FPU-stack
  // (0xD9 /3 = FSTP m32real).
  enc_class Pop_Mem_F( stackSlotF dst ) %{ // FSTP_S [ESP+dst]
    store_to_stackslot( cbuf, 0xD9, 0x03, $dst$$disp );
  %}
2584 
  // Same as Pop_Mem_F except for opcode
  // Push FPU's TOS double to a stack-slot, and pop FPU-stack
  // (0xDD /3 = FSTP m64real).
  enc_class Pop_Mem_D( stackSlotD dst ) %{ // FSTP_D [ESP+dst]
    store_to_stackslot( cbuf, 0xDD, 0x03, $dst$$disp );
  %}
2590 
  // Store the x87 top-of-stack into ST(i) and pop (same bytes as Pop_Reg_D).
  enc_class Pop_Reg_F( regF dst ) %{
    emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
    emit_d8( cbuf, 0xD8+$dst$$reg );
  %}
2595 
  // Push a copy of x87 register dst onto the FPU stack (same as Push_Reg_D).
  enc_class Push_Reg_F( regF dst ) %{
    emit_opcode( cbuf, 0xD9 );           // FLD    ST(i-1)
    emit_d8( cbuf, 0xC0-1+$dst$$reg );
  %}
2600 
  // Push FPU's float to a stack-slot, and pop FPU-stack.
  // If src is already FPR1 a plain FST (/2) suffices; otherwise src is
  // FLD'd to the top first and FSTP (/3) pops the copy again.
  enc_class Pop_Mem_Reg_F( stackSlotF dst, regF src ) %{
    int pop = 0x02;                      // /2 = FST (no pop)
    if ($src$$reg != FPR1L_enc) {
      emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
      pop = 0x03;                        // /3 = FSTP (pop the pushed copy)
    }
    store_to_stackslot( cbuf, 0xD9, pop, $dst$$disp ); // FST<P>_S  [ESP+dst]
  %}
2611 
  // Push FPU's double to a stack-slot, and pop FPU-stack.
  // Same FLD-then-FSTP trick as Pop_Mem_Reg_F, using the m64 opcode 0xDD.
  enc_class Pop_Mem_Reg_D( stackSlotD dst, regD src ) %{
    int pop = 0x02;                      // /2 = FST (no pop)
    if ($src$$reg != FPR1L_enc) {
      emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
      pop = 0x03;                        // /3 = FSTP (pop the pushed copy)
    }
    store_to_stackslot( cbuf, 0xDD, pop, $dst$$disp ); // FST<P>_D  [ESP+dst]
  %}
2622 
  // Push FPU's double to a FPU-stack-slot, and pop FPU-stack.
  // Copies src into dst register-to-register: FST ST(i) (base 0xD0) when src
  // is already at the top, FLD + FSTP ST(i) (base 0xD8) otherwise.
  enc_class Pop_Reg_Reg_D( regD dst, regF src ) %{
    int pop = 0xD0 - 1; // -1 since we skip FLD
    if ($src$$reg != FPR1L_enc) {
      emit_opcode( cbuf, 0xD9 );         // FLD    ST(src-1)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
      pop = 0xD8;                        // FSTP base: pop the pushed copy
    }
    emit_opcode( cbuf, 0xDD );
    emit_d8( cbuf, pop+$dst$$reg );      // FST<P> ST(i)
  %}
2634 
2635 
  // dst = src + src1*src2, computed on the x87 stack via the assembler:
  // load src1 to TOS, multiply by src2, add src, store-and-pop into dst.
  enc_class Mul_Add_F( regF dst, regF src, regF src1, regF src2 ) %{
    MacroAssembler masm(&cbuf);
    masm.fld_s(  $src1$$reg-1);   // nothing at TOS, load TOS from src1.reg
    masm.fmul(   $src2$$reg+0);   // value at TOS
    masm.fadd(   $src$$reg+0);    // value at TOS
    masm.fstp_d( $dst$$reg+0);    // value at TOS, popped off after store
  %}
2643 
2644 
  // Set up for FPREM-style ops: push dst to TOS, and if src is not already
  // FPR1, rotate it there with an FINCSTP/FXCH/FDECSTP sequence so the
  // modulo operands sit in ST(0)/ST(1).
  enc_class Push_Reg_Mod_D( regD dst, regD src) %{
    // load dst in FPR0
    emit_opcode( cbuf, 0xD9 );
    emit_d8( cbuf, 0xC0-1+$dst$$reg );   // FLD ST(dst-1)
    if ($src$$reg != FPR1L_enc) {
      // fincstp - rotate the stack-top pointer up by one
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF7);
      // swap src with FPR1:
      // FXCH FPR1 with src
      emit_opcode(cbuf, 0xD9);
      emit_d8(cbuf, 0xC8-1+$src$$reg );
      // fdecstp - rotate the stack-top pointer back down
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF6);
    }
  %}
2662 
  // Move two XMM doubles onto the x87 stack by bouncing each through an
  // 8-byte scratch area at [ESP]: MOVSD to memory, then FLD_D.  Afterwards
  // src1 is in ST(1) and src0 in ST(0).  The scratch ESP adjustment is
  // undone elsewhere (see Push_ResultXD / pop_stack_temp_qword).
  enc_class Push_ModD_encoding( regXD src0, regXD src1) %{
    // Allocate an 8-byte scratch slot on the stack
    emit_opcode(cbuf,0x83);            // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x08);

    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src1
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src0
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

  %}
2686 
  // Single-precision variant of Push_ModD_encoding: bounce two XMM floats
  // through a 4-byte scratch slot at [ESP] via MOVSS + FLD.  Afterwards
  // src1 is in ST(1) and src0 in ST(0).
  enc_class Push_ModX_encoding( regX src0, regX src1) %{
    // Allocate a 4-byte scratch slot on the stack
    emit_opcode(cbuf,0x83);            // SUB ESP,4
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x04);

    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src1
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src1$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode  (cbuf, 0xF3 );     // MOVSS [ESP], src0
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src0$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xD9 );      // FLD [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

  %}
2710 
  // Move the x87 result at TOS into an XMM register: FSTP to the scratch
  // slot at [ESP], load it into dst, then release the 8 scratch bytes
  // allocated earlier (e.g. by Push_ModD_encoding).
  enc_class Push_ResultXD(regXD dst) %{
    store_to_stackslot( cbuf, 0xDD, 0x03, 0 ); //FSTP [ESP]

    // UseXmmLoadAndClearUpper ? movsd dst,[esp] : movlpd dst,[esp]
    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, UseXmmLoadAndClearUpper ? 0x10 : 0x12);
    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0x83);    // ADD ESP,8 - free the scratch slot
    emit_opcode(cbuf,0xC4);
    emit_d8(cbuf,0x08);
  %}
2724 
  // Single-precision variant of Push_ResultXD: FSTP_S the x87 TOS to [ESP],
  // MOVSS it into dst, then free d8 scratch bytes (4 or 8, chosen by the
  // instruct that allocated them).
  enc_class Push_ResultX(regX dst, immI d8) %{
    store_to_stackslot( cbuf, 0xD9, 0x03, 0 ); //FSTP_S [ESP]

    emit_opcode  (cbuf, 0xF3 );     // MOVSS dst(xmm), [ESP]
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x10 );
    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0x83);    // ADD ESP,d8 (4 or 8)
    emit_opcode(cbuf,0xC4);
    emit_d8(cbuf,$d8$$constant);
  %}
2737 
  // Push one XMM double onto the x87 stack: allocate an 8-byte scratch
  // slot, MOVSD src to it, FLD_D it.  The slot stays allocated; the
  // matching pop/ADD ESP is emitted by a companion encoding.
  enc_class Push_SrcXD(regXD src) %{
    // Allocate an 8-byte scratch slot on the stack
    emit_opcode(cbuf,0x83);            // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x08);

    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], src
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  %}
2752 
  // Reserve an 8-byte scratch qword on the stack (paired with
  // pop_stack_temp_qword below).
  enc_class push_stack_temp_qword() %{
    emit_opcode(cbuf,0x83);     // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8    (cbuf,0x08);
  %}
2758 
  // Release the 8-byte scratch qword reserved by push_stack_temp_qword.
  enc_class pop_stack_temp_qword() %{
    emit_opcode(cbuf,0x83);     // ADD ESP,8
    emit_opcode(cbuf,0xC4);
    emit_d8    (cbuf,0x08);
  %}
2764 
  // Copy an XMM double onto the x87 stack top via the scratch qword at
  // [ESP] (which must already be allocated, e.g. by push_stack_temp_qword).
  enc_class push_xmm_to_fpr1( regXD xmm_src ) %{
    emit_opcode  (cbuf, 0xF2 );     // MOVSD [ESP], xmm_src
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $xmm_src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xDD );      // FLD_D [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
  %}
2774 
  // Compute X^Y using Intel's fast hardware instructions, if possible.
  // Otherwise return a NaN.
  // Splits Q = Y*log2(X) into integer and fractional parts: F2XM1 handles
  // 2^frac(Q), while 2^int(Q) is built in integer registers by placing
  // int(Q)+1023 in a double's exponent field, then the two are multiplied.
  // Clobbers EAX, EBX and ECX, and uses the scratch qword at [ESP].
  enc_class pow_exp_core_encoding %{
    // FPR1 holds Y*ln2(X).  Compute FPR1 = 2^(Y*ln2(X))
    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xC0);  // fdup = fld st(0)          Q       Q
    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xFC);  // frndint               int(Q)      Q
    emit_opcode(cbuf,0xDC); emit_opcode(cbuf,0xE9);  // fsub st(1) -= st(0);  int(Q) frac(Q)
    emit_opcode(cbuf,0xDB);                          // FISTP [ESP]           frac(Q)
    emit_opcode(cbuf,0x1C);
    emit_d8(cbuf,0x24);
    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xF0);  // f2xm1                 2^frac(Q)-1
    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xE8);  // fld1                  1 2^frac(Q)-1
    emit_opcode(cbuf,0xDE); emit_opcode(cbuf,0xC1);  // faddp                 2^frac(Q)
    emit_opcode(cbuf,0x8B);                          // mov eax,[esp+0]=int(Q)
    encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 0, false);
    emit_opcode(cbuf,0xC7);                          // mov ecx,0xFFFFF800 - overflow mask
    emit_rm(cbuf, 0x3, 0x0, ECX_enc);
    emit_d32(cbuf,0xFFFFF800);
    emit_opcode(cbuf,0x81);                          // add eax,1023 - the double exponent bias
    emit_rm(cbuf, 0x3, 0x0, EAX_enc);
    emit_d32(cbuf,1023);
    emit_opcode(cbuf,0x8B);                          // mov ebx,eax
    emit_rm(cbuf, 0x3, EBX_enc, EAX_enc);
    emit_opcode(cbuf,0xC1);                          // shl eax,20 - Slide to exponent position
    emit_rm(cbuf,0x3,0x4,EAX_enc);
    emit_d8(cbuf,20);
    emit_opcode(cbuf,0x85);                          // test ebx,ecx - check for overflow
    emit_rm(cbuf, 0x3, EBX_enc, ECX_enc);
    emit_opcode(cbuf,0x0F); emit_opcode(cbuf,0x45);  // CMOVne eax,ecx - overflow; stuff NAN into EAX
    emit_rm(cbuf, 0x3, EAX_enc, ECX_enc);
    emit_opcode(cbuf,0x89);                          // mov [esp+4],eax - Store as high half of double word
    encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 4, false);
    emit_opcode(cbuf,0xC7);                          // mov [esp+0],0   - [ESP] = (double)(1<<int(Q)) = 2^int(Q)
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
    emit_d32(cbuf,0);
    emit_opcode(cbuf,0xDC);                          // fmul qword st(0),[esp+0]; FPR1 = 2^int(Q)*2^frac(Q) = 2^Q
    encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false);
  %}
2813 
//   enc_class Pop_Reg_Mod_D( regD dst, regD src)
//   was replaced by Push_Result_Mod_D followed by Pop_Reg_X() or Pop_Mem_X()

  // Undo the stack rotation done by Push_Reg_Mod_D: if src was moved to
  // FPR1, rotate it back with FINCSTP/FXCH/FDECSTP so the result is left
  // at TOS for a following Pop_Reg_* / Pop_Mem_* encoding.
  enc_class Push_Result_Mod_D( regD src) %{
    if ($src$$reg != FPR1L_enc) {
      // fincstp - rotate the stack-top pointer up by one
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF7);
      // FXCH FPR1 with src
      emit_opcode(cbuf, 0xD9);
      emit_d8(cbuf, 0xC8-1+$src$$reg );
      // fdecstp - rotate the stack-top pointer back down
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF6);
    }
    // // following asm replaced with Pop_Reg_F or Pop_Mem_F
    // // FSTP   FPR$dst$$reg
    // emit_opcode( cbuf, 0xDD );
    // emit_d8( cbuf, 0xD8+$dst$$reg );
  %}
2834 
  // Copy the FPU status word into EFLAGS (FNSTSW AX + SAHF), then branch
  // 5 bytes forward when the comparison was ordered (parity clear),
  // skipping a NaN-handling fixup that the instruct emits next.
  enc_class fnstsw_sahf_skip_parity() %{
    // fnstsw ax
    emit_opcode( cbuf, 0xDF );
    emit_opcode( cbuf, 0xE0 );
    // sahf
    emit_opcode( cbuf, 0x9E );
    // jnp  ::skip
    emit_opcode( cbuf, 0x7B );
    emit_opcode( cbuf, 0x05 );
  %}
2845 
  // x87 floating-point remainder loop.  FPREM only reduces partially, so
  // loop (via the C2 bit copied into the parity flag) until it reports the
  // reduction is complete.  Clobbers AX.
  enc_class emitModD() %{
    // fprem must be iterative
    // :: loop
    // fprem
    emit_opcode( cbuf, 0xD9 );
    emit_opcode( cbuf, 0xF8 );
    // wait - serialize pending FP exceptions before reading the status word
    emit_opcode( cbuf, 0x9b );
    // fnstsw ax
    emit_opcode( cbuf, 0xDF );
    emit_opcode( cbuf, 0xE0 );
    // sahf
    emit_opcode( cbuf, 0x9E );
    // jp  ::loop - near form with a -12 (0xFFFFFFF4) displacement
    emit_opcode( cbuf, 0x0F );
    emit_opcode( cbuf, 0x8A );
    emit_opcode( cbuf, 0xF4 );
    emit_opcode( cbuf, 0xFF );
    emit_opcode( cbuf, 0xFF );
    emit_opcode( cbuf, 0xFF );
  %}
2867 
  // Move the FPU condition codes into EFLAGS for a branch, mapping the
  // unordered (NaN) case to "less than" by forcing the carry flag.
  // Clobbers AX.
  enc_class fpu_flags() %{
    // fnstsw_ax
    emit_opcode( cbuf, 0xDF);
    emit_opcode( cbuf, 0xE0);
    // test ax,0x0400 - C2 bit: was the comparison unordered?
    emit_opcode( cbuf, 0x66 );   // operand-size prefix for 16-bit immediate
    emit_opcode( cbuf, 0xA9 );
    emit_d16   ( cbuf, 0x0400 );
    // // // This sequence works, but stalls for 12-16 cycles on PPro
    // // test rax,0x0400
    // emit_opcode( cbuf, 0xA9 );
    // emit_d32   ( cbuf, 0x00000400 );
    //
    // jz exit (no unordered comparison) - skip the 2-byte MOV below
    emit_opcode( cbuf, 0x74 );
    emit_d8    ( cbuf, 0x02 );
    // mov ah,1 - treat as LT case (set carry flag)
    emit_opcode( cbuf, 0xB4 );
    emit_d8    ( cbuf, 0x01 );
    // sahf - load AH into the low byte of EFLAGS
    emit_opcode( cbuf, 0x9E);
  %}
2890 
  // Post-compare fixup for P6 FCOMI-style compares: if the comparison was
  // unordered (NaN sets the parity flag), force the flags to the LT case.
  // Clobbers AH.
  enc_class cmpF_P6_fixup() %{
    // Fixup the integer flags in case comparison involved a NaN
    //
    // JNP exit (no unordered comparison, P-flag is set by NaN) - skip 3 bytes
    emit_opcode( cbuf, 0x7B );
    emit_d8    ( cbuf, 0x03 );
    // MOV AH,1 - treat as LT case (set carry flag)
    emit_opcode( cbuf, 0xB4 );
    emit_d8    ( cbuf, 0x01 );
    // SAHF - load AH into the low byte of EFLAGS
    emit_opcode( cbuf, 0x9E);
    // NOP     // target for branch to avoid branch to branch
    emit_opcode( cbuf, 0x90);
  %}
2905 
2906 //     fnstsw_ax();
2907 //     sahf();
2908 //     movl(dst, nan_result);
2909 //     jcc(Assembler::parity, exit);
2910 //     movl(dst, less_result);
2911 //     jcc(Assembler::below, exit);
2912 //     movl(dst, equal_result);
2913 //     jcc(Assembler::equal, exit);
2914 //     movl(dst, greater_result);
2915 
2916 // less_result     =  1;
2917 // greater_result  = -1;
2918 // equal_result    = 0;
2919 // nan_result      = -1;
2920 
  // Materialize a three-way float compare result in dst following the
  // pseudocode above: -1 for NaN or less, 0 for equal, 1 for greater.
  // The MOV r32,imm32 (0xB8+reg) forms do not touch EFLAGS, so each Jcc
  // still sees the original FPU flags loaded by SAHF.  Clobbers AX.
  enc_class CmpF_Result(eRegI dst) %{
    // fnstsw_ax();
    emit_opcode( cbuf, 0xDF);
    emit_opcode( cbuf, 0xE0);
    // sahf
    emit_opcode( cbuf, 0x9E);
    // movl(dst, nan_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, -1 );
    // jcc(Assembler::parity, exit); - unordered => keep -1
    emit_opcode( cbuf, 0x7A );
    emit_d8    ( cbuf, 0x13 );
    // movl(dst, less_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, -1 );
    // jcc(Assembler::below, exit);
    emit_opcode( cbuf, 0x72 );
    emit_d8    ( cbuf, 0x0C );
    // movl(dst, equal_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, 0 );
    // jcc(Assembler::equal, exit);
    emit_opcode( cbuf, 0x74 );
    emit_d8    ( cbuf, 0x05 );
    // movl(dst, greater_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, 1 );
  %}
2949 
2950 
2951   // XMM version of CmpF_Result. Because the XMM compare
2952   // instructions set the EFLAGS directly. It becomes simpler than
2953   // the float version above.
2954   enc_class CmpX_Result(eRegI dst) %{
2955     MacroAssembler _masm(&cbuf);
2956     Label nan, inc, done;
2957 
2958     __ jccb(Assembler::parity, nan);
2959     __ jccb(Assembler::equal,  done);
2960     __ jccb(Assembler::above,  inc);
2961     __ bind(nan);
2962     __ decrement(as_Register($dst$$reg)); // NO L qqq
2963     __ jmpb(done);
2964     __ bind(inc);
2965     __ increment(as_Register($dst$$reg)); // NO L qqq
2966     __ bind(done);
2967   %}
2968 
  // Compare the longs and set flags
  // BROKEN!  Do Not use as-is
  // Compares the high halves, and only compares the low halves when the
  // high halves are equal, so the final flags come from whichever CMP ran
  // last - not a correct full 64-bit signed comparison.
  enc_class cmpl_test( eRegL src1, eRegL src2 ) %{
    // CMP    $src1.hi,$src2.hi
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
    // JNE,s  done - skip the 2-byte low-half compare
    emit_opcode(cbuf,0x75);
    emit_d8(cbuf, 2 );
    // CMP    $src1.lo,$src2.lo
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
// done:
  %}
2983 
  // Sign-extend a 32-bit int into a long register pair: copy src into both
  // halves, then arithmetic-shift the high half by 31 to leave the sign.
  enc_class convert_int_long( regL dst, eRegI src ) %{
    // mov $dst.lo,$src
    int dst_encoding = $dst$$reg;
    int src_encoding = $src$$reg;
    encode_Copy( cbuf, dst_encoding  , src_encoding );
    // mov $dst.hi,$src
    encode_Copy( cbuf, HIGH_FROM_LOW(dst_encoding), src_encoding );
    // sar $dst.hi,31 - replicate the sign bit across the high half
    emit_opcode( cbuf, 0xC1 );
    emit_rm(cbuf, 0x3, 7, HIGH_FROM_LOW(dst_encoding) );
    emit_d8(cbuf, 0x1F );
  %}
2996 
  // Convert a long to a double: push both halves, FILD the 64-bit integer
  // from [ESP] onto the FPU stack, then release the 8 stack bytes.
  enc_class convert_long_double( eRegL src ) %{
    // push $src.hi
    emit_opcode(cbuf, 0x50+HIGH_FROM_LOW($src$$reg));
    // push $src.lo
    emit_opcode(cbuf, 0x50+$src$$reg  );
    // fild 64-bits at [SP] (0xDF /5 = FILD m64int)
    emit_opcode(cbuf,0xdf);
    emit_d8(cbuf, 0x6C);
    emit_d8(cbuf, 0x24);
    emit_d8(cbuf, 0x00);
    // pop stack
    emit_opcode(cbuf, 0x83); // add  SP, #8
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d8(cbuf, 0x8);
  %}
3012 
  // Multiply-high for constant division: IMUL leaves the 64-bit product in
  // EDX:EAX, then EDX (the high 32 bits) is arithmetically shifted right by
  // cnt-32; a zero remaining count skips the shift entirely.
  enc_class multiply_con_and_shift_high( eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32_63 cnt, eFlagsReg cr ) %{
    // IMUL   EDX:EAX,$src1  (0xF7 /5 = one-operand signed multiply by EAX)
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x5, $src1$$reg );
    // SAR    EDX,$cnt-32
    int shift_count = ((int)$cnt$$constant) - 32;
    if (shift_count > 0) {
      emit_opcode(cbuf, 0xC1);
      emit_rm(cbuf, 0x3, 7, $dst$$reg );  // /7 = SAR
      emit_d8(cbuf, shift_count);
    }
  %}
3025 
  // this version doesn't have add sp, 8
  // Same as convert_long_double but leaves the two pushed words on the
  // stack; the instruct using it reclaims ESP itself.
  enc_class convert_long_double2( eRegL src ) %{
    // push $src.hi
    emit_opcode(cbuf, 0x50+HIGH_FROM_LOW($src$$reg));
    // push $src.lo
    emit_opcode(cbuf, 0x50+$src$$reg  );
    // fild 64-bits at [SP] (0xDF /5 = FILD m64int)
    emit_opcode(cbuf,0xdf);
    emit_d8(cbuf, 0x6C);
    emit_d8(cbuf, 0x24);
    emit_d8(cbuf, 0x00);
  %}
3038 
  // Widening signed multiply: EDX:EAX = (long)EAX * (long)src.
  enc_class long_int_multiply( eADXRegL dst, nadxRegI src) %{
    // Basic idea: long = (long)int * (long)int
    // IMUL EDX:EAX, src  (0xF7 /5 = one-operand signed multiply)
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x5, $src$$reg);
  %}
3045 
  // Widening unsigned multiply: EDX:EAX = (uint)EAX * (uint)src.
  enc_class long_uint_multiply( eADXRegL dst, nadxRegI src) %{
    // Basic Idea:  long = (int & 0xffffffffL) * (int & 0xffffffffL)
    // MUL EDX:EAX, src  (0xF7 /4 = one-operand unsigned multiply)
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x4, $src$$reg);
  %}
3052 
  // Full 64x64->64 multiply with dst pinned to EDX:EAX.  Cross terms
  // (x_lo*y_hi and x_hi*y_lo) are accumulated in tmp, the unsigned
  // MUL produces the low product, and the cross terms are folded into
  // the high half.
  enc_class long_multiply( eADXRegL dst, eRegL src, eRegI tmp ) %{
    // Basic idea: lo(result) = lo(x_lo * y_lo)
    //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
    // MOV    $tmp,$src.lo
    encode_Copy( cbuf, $tmp$$reg, $src$$reg );
    // IMUL   $tmp,EDX      - tmp = y_lo * x_hi
    emit_opcode( cbuf, 0x0F );
    emit_opcode( cbuf, 0xAF );
    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
    // MOV    EDX,$src.hi
    encode_Copy( cbuf, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg) );
    // IMUL   EDX,EAX       - EDX = y_hi * x_lo
    emit_opcode( cbuf, 0x0F );
    emit_opcode( cbuf, 0xAF );
    emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg );
    // ADD    $tmp,EDX      - tmp = sum of both cross terms
    emit_opcode( cbuf, 0x03 );
    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
    // MUL   EDX:EAX,$src.lo - 64-bit product of the two low halves
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x4, $src$$reg );
    // ADD    EDX,ESI       - fold the cross terms into the high half
    emit_opcode( cbuf, 0x03 );
    emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $tmp$$reg );
  %}
3078 
  // Multiply the long in EDX:EAX by a small constant (fits in imm8).
  // Only one cross term exists because the constant's high word is zero.
  enc_class long_multiply_con( eADXRegL dst, immL_127 src, eRegI tmp ) %{
    // Basic idea: lo(result) = lo(src * y_lo)
    //             hi(result) = hi(src * y_lo) + lo(src * y_hi)
    // IMUL   $tmp,EDX,$src - tmp = y_hi * src (cross term), imm8 form
    emit_opcode( cbuf, 0x6B );
    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
    emit_d8( cbuf, (int)$src$$constant );
    // MOV    EDX,$src      - load the constant for the widening MUL
    emit_opcode(cbuf, 0xB8 + EDX_enc);
    emit_d32( cbuf, (int)$src$$constant );
    // MUL   EDX:EAX,EDX    - 64-bit product src * y_lo
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x4, EDX_enc );
    // ADD    EDX,ESI       - fold the cross term into the high half
    emit_opcode( cbuf, 0x03 );
    emit_rm( cbuf, 0x3, EDX_enc, $tmp$$reg );
  %}
3096 
  // 64-bit signed division via a runtime call: push both operands (four
  // 32-bit words), call SharedRuntime::ldiv, then pop the 16 argument
  // bytes.  Note HIGH_FROM_LOW also maps 0x50+reg to the paired high
  // register's PUSH opcode, since it just adds the register-pair offset.
  enc_class long_div( eRegL src1, eRegL src2 ) %{
    // PUSH src1.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src1$$reg) );
    // PUSH src1.lo
    emit_opcode(cbuf,               0x50+$src1$$reg  );
    // PUSH src2.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src2$$reg) );
    // PUSH src2.lo
    emit_opcode(cbuf,               0x50+$src2$$reg  );
    // CALL directly to the runtime
    cbuf.set_inst_mark();      // mark for relocation of the call displacement
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (CAST_FROM_FN_PTR(address, SharedRuntime::ldiv) - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Restore stack
    emit_opcode(cbuf, 0x83); // add  SP, #framesize
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d8(cbuf, 4*4);      // four 32-bit argument words
  %}
3115 
  // 64-bit signed remainder via a runtime call; identical to long_div
  // except the target is SharedRuntime::lrem.
  enc_class long_mod( eRegL src1, eRegL src2 ) %{
    // PUSH src1.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src1$$reg) );
    // PUSH src1.lo
    emit_opcode(cbuf,               0x50+$src1$$reg  );
    // PUSH src2.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src2$$reg) );
    // PUSH src2.lo
    emit_opcode(cbuf,               0x50+$src2$$reg  );
    // CALL directly to the runtime
    cbuf.set_inst_mark();      // mark for relocation of the call displacement
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (CAST_FROM_FN_PTR(address, SharedRuntime::lrem ) - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Restore stack
    emit_opcode(cbuf, 0x83); // add  SP, #framesize
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d8(cbuf, 4*4);      // four 32-bit argument words
  %}
3134 
  // Set flags for a long compared against zero (equality only):
  // tmp = src.lo | src.hi, so ZF == 1 iff the 64-bit value is zero.
  // tmp is killed.
  enc_class long_cmp_flags0( eRegL src, eRegI tmp ) %{
    // MOV   $tmp,$src.lo
    emit_opcode(cbuf, 0x8B);
    emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
    // OR    $tmp,$src.hi
    emit_opcode(cbuf, 0x0B);
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg));
  %}
3143 
  // Set flags for a long equality compare: compare the low halves; if they
  // differ the flags already show not-equal, so skip the 2-byte high-half
  // compare.  Suitable for EQ/NE tests only.
  enc_class long_cmp_flags1( eRegL src1, eRegL src2 ) %{
    // CMP    $src1.lo,$src2.lo
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
    // JNE,s  skip                  ! skip = size of the next CMP (2 bytes)
    emit_cc(cbuf, 0x70, 0x5);
    emit_d8(cbuf,2);
    // CMP    $src1.hi,$src2.hi
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
  %}
3155 
  // Set flags for a full (signed) long compare via the classic CMP/SBB idiom:
  // subtract src2 from src1 64-bit-wide, discarding the result but keeping
  // the flags.  tmp receives src1.hi and is killed.
  enc_class long_cmp_flags2( eRegL src1, eRegL src2, eRegI tmp ) %{
    // CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
    // MOV    $tmp,$src1.hi
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src1$$reg) );
    // SBB   $tmp,$src2.hi\t! Compute flags for long compare
    emit_opcode( cbuf, 0x1B );
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src2$$reg) );
  %}
3167 
  // Set flags for a (signed) long compared against zero: compute 0 - src
  // 64-bit-wide via CMP/SBB, keeping only the flags.  tmp is killed.
  enc_class long_cmp_flags3( eRegL src, eRegI tmp ) %{
    // XOR    $tmp,$tmp             ! tmp = 0
    emit_opcode(cbuf,0x33);  // XOR
    emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
    // CMP    $tmp,$src.lo
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg );
    // SBB    $tmp,$src.hi
    emit_opcode( cbuf, 0x1B );
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg) );
  %}
3179 
 // Sniff, sniff... smells like Gnu Superoptimizer
 // 64-bit two's-complement negation in place: NEG both halves independently,
 // then SBB 0 from the high half to propagate the borrow from the low half.
  enc_class neg_long( eRegL dst ) %{
    emit_opcode(cbuf,0xF7);    // NEG hi        (0xF7 /3)
    emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
    emit_opcode(cbuf,0xF7);    // NEG lo
    emit_rm    (cbuf,0x3, 0x3,               $dst$$reg );
    emit_opcode(cbuf,0x83);    // SBB hi,0      ! borrow from NEG lo
    emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
    emit_d8    (cbuf,0 );
  %}
3190 
  // Load a 64-bit value from memory into an XMM register (MOVQ).
  enc_class movq_ld(regXD dst, memory mem) %{
    MacroAssembler _masm(&cbuf);
    __ movq($dst$$XMMRegister, $mem$$Address);
  %}
3195 
  // Store the low 64 bits of an XMM register to memory (MOVQ).
  enc_class movq_st(memory mem, regXD src) %{
    MacroAssembler _masm(&cbuf);
    __ movq($mem$$Address, $src$$XMMRegister);
  %}
3200 
  // Replicate a byte across the low 8 byte lanes of dst: copy src to dst,
  // PUNPCKLBW dst,dst duplicates each of the low 8 bytes, then PSHUFLW with
  // imm 0x00 broadcasts word 0 into all four low words.
  enc_class pshufd_8x8(regX dst, regX src) %{
    MacroAssembler _masm(&cbuf);

    encode_CopyXD(cbuf, $dst$$reg, $src$$reg);
    __ punpcklbw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg));
    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($dst$$reg), 0x00);
  %}
3208 
  // Broadcast the low 16-bit element of src into the four low words of dst
  // (PSHUFLW with shuffle immediate 0x00).
  enc_class pshufd_4x16(regX dst, regX src) %{
    MacroAssembler _masm(&cbuf);

    __ pshuflw(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), 0x00);
  %}
3214 
  // General 32-bit-element shuffle: PSHUFD dst, src, mode where mode is the
  // 8-bit shuffle-control immediate.
  enc_class pshufd(regXD dst, regXD src, int mode) %{
    MacroAssembler _masm(&cbuf);

    __ pshufd(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg), $mode);
  %}
3220 
  // Bitwise XOR of two XMM registers (PXOR); pxor(x, x) zeroes x.
  enc_class pxor(regXD dst, regXD src) %{
    MacroAssembler _masm(&cbuf);

    __ pxor(as_XMMRegister($dst$$reg), as_XMMRegister($src$$reg));
  %}
3226 
  // Move a 32-bit general-purpose register into the low dword of an XMM
  // register (MOVD).
  enc_class mov_i2x(regXD dst, eRegI src) %{
    MacroAssembler _masm(&cbuf);

    __ movdl(as_XMMRegister($dst$$reg), as_Register($src$$reg));
  %}
3232 
3233 
3234   // Because the transitions from emitted code to the runtime
3235   // monitorenter/exit helper stubs are so slow it's critical that
3236   // we inline both the stack-locking fast-path and the inflated fast path.
3237   //
3238   // See also: cmpFastLock and cmpFastUnlock.
3239   //
3240   // What follows is a specialized inline transliteration of the code
3241   // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
3242   // another option would be to emit TrySlowEnter and TrySlowExit methods
3243   // at startup-time.  These methods would accept arguments as
3244   // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
3245   // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
3246   // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
3247   // In practice, however, the # of lock sites is bounded and is usually small.
3248   // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
  // if the processor uses simple bimodal branch predictors keyed by EIP,
  // since the helper routines would be called from multiple synchronization
  // sites.
3252   //
3253   // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
3254   // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
3255   // to those specialized methods.  That'd give us a mostly platform-independent
3256   // implementation that the JITs could optimize and inline at their pleasure.
  // Done correctly, the only time we'd need to cross to native code would be
3258   // to park() or unpark() threads.  We'd also need a few more unsafe operators
3259   // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
3260   // (b) explicit barriers or fence operations.
3261   //
3262   // TODO:
3263   //
3264   // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
3265   //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
3266   //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
3267   //    the lock operators would typically be faster than reifying Self.
3268   //
3269   // *  Ideally I'd define the primitives as:
3270   //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
3271   //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
3272   //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
  //    Instead, we're stuck with the rather awkward and brittle register assignments below.
3274   //    Furthermore the register assignments are overconstrained, possibly resulting in
3275   //    sub-optimal code near the synchronization site.
3276   //
3277   // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
3278   //    Alternately, use a better sp-proximity test.
3279   //
3280   // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
3281   //    Either one is sufficient to uniquely identify a thread.
3282   //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
3283   //
3284   // *  Intrinsify notify() and notifyAll() for the common cases where the
3285   //    object is locked by the calling thread but the waitlist is empty.
3286   //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
3287   //
3288   // *  use jccb and jmpb instead of jcc and jmp to improve code density.
3289   //    But beware of excessive branch density on AMD Opterons.
3290   //
3291   // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
3292   //    or failure of the fast-path.  If the fast-path fails then we pass
3293   //    control to the slow-path, typically in C.  In Fast_Lock and
3294   //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
3295   //    will emit a conditional branch immediately after the node.
3296   //    So we have branches to branches and lots of ICC.ZF games.
3297   //    Instead, it might be better to have C2 pass a "FailureLabel"
3298   //    into Fast_Lock and Fast_Unlock.  In the case of success, control
3299   //    will drop through the node.  ICC.ZF is undefined at exit.
3300   //    In the case of failure, the node will branch directly to the
3301   //    FailureLabel
3302 
3303 
3304   // obj: object to lock
3305   // box: on-stack box address (displaced header location) - KILLED
3306   // rax,: tmp -- KILLED
3307   // scr: tmp -- KILLED
  // Inline fast-path for monitorenter.  On exit ICC.ZF == 1 indicates the
  // lock was acquired; ZF == 0 forces control through the slow path (see the
  // protocol comment at the end of this encoding).  tmp must be EAX, the
  // implicit cmpxchg comparand.  EmitSync is a diagnostic knob: bit 0 forces
  // all sync through the slow path, bit 1 selects a stack-lock-only fast
  // path; otherwise the full stack-lock + inflated fast path is emitted.
  enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{

    Register objReg = as_Register($obj$$reg);
    Register boxReg = as_Register($box$$reg);
    Register tmpReg = as_Register($tmp$$reg);
    Register scrReg = as_Register($scr$$reg);

    // Ensure the register assignments are disjoint
    guarantee (objReg != boxReg, "") ;
    guarantee (objReg != tmpReg, "") ;
    guarantee (objReg != scrReg, "") ;
    guarantee (boxReg != tmpReg, "") ;
    guarantee (boxReg != scrReg, "") ;
    guarantee (tmpReg == as_Register(EAX_enc), "") ;

    MacroAssembler masm(&cbuf);

    if (_counters != NULL) {
      masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
    }
    if (EmitSync & 1) {
        // set box->dhw = unused_mark (3)
        // Force all sync thru slow-path: slow_enter() and slow_exit()
        masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;
        masm.cmpptr (rsp, (int32_t)0) ;   // rsp != 0, so ZF = 0 -> always slow path
    } else
    if (EmitSync & 2) {
        // Simple stack-locking-only variant (no inflated fast path).
        Label DONE_LABEL ;
        if (UseBiasedLocking) {
           // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
           masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
        }

        masm.movptr(tmpReg, Address(objReg, 0)) ;          // fetch markword
        masm.orptr (tmpReg, 0x1);                          // set the "unlocked" bit
        masm.movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS
        if (os::is_MP()) { masm.lock();  }
        masm.cmpxchgptr(boxReg, Address(objReg, 0));          // Updates tmpReg
        masm.jcc(Assembler::equal, DONE_LABEL);
        // Recursive locking: is the mark a small positive offset into our stack?
        masm.subptr(tmpReg, rsp);
        masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
        masm.movptr(Address(boxReg, 0), tmpReg);           // 0 in dhw marks a recursive lock
        masm.bind(DONE_LABEL) ;
    } else {
      // Possible cases that we'll encounter in fast_lock
      // ------------------------------------------------
      // * Inflated
      //    -- unlocked
      //    -- Locked
      //       = by self
      //       = by other
      // * biased
      //    -- by Self
      //    -- by other
      // * neutral
      // * stack-locked
      //    -- by self
      //       = sp-proximity test hits
      //       = sp-proximity test generates false-negative
      //    -- by other
      //

      Label IsInflated, DONE_LABEL, PopDone ;

      // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
      // order to reduce the number of conditional branches in the most common cases.
      // Beware -- there's a subtle invariant that fetch of the markword
      // at [FETCH], below, will never observe a biased encoding (*101b).
      // If this invariant is not held we risk exclusion (safety) failure.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
        masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
      }

      masm.movptr(tmpReg, Address(objReg, 0)) ;         // [FETCH]
      masm.testptr(tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
      masm.jccb  (Assembler::notZero, IsInflated) ;

      // Attempt stack-locking ...
      masm.orptr (tmpReg, 0x1);
      masm.movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
      if (os::is_MP()) { masm.lock();  }
      masm.cmpxchgptr(boxReg, Address(objReg, 0));           // Updates tmpReg
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal,
                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jccb (Assembler::equal, DONE_LABEL);

      // Recursive locking
      masm.subptr(tmpReg, rsp);
      masm.andptr(tmpReg, 0xFFFFF003 );
      masm.movptr(Address(boxReg, 0), tmpReg);
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal,
                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jmp  (DONE_LABEL) ;

      masm.bind (IsInflated) ;

      // The object is inflated.
      //
      // TODO-FIXME: eliminate the ugly use of manifest constants:
      //   Use markOopDesc::monitor_value instead of "2".
      //   use markOop::unused_mark() instead of "3".
      // The tmpReg value is an objectMonitor reference ORed with
      // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
      // objectmonitor pointer by masking off the "2" bit or we can just
      // use tmpReg as an objectmonitor pointer but bias the objectmonitor
      // field offsets with "-2" to compensate for and annul the low-order tag bit.
      //
      // I use the latter as it avoids AGI stalls.
      // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
      // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
      //
      #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)

      // boxReg refers to the on-stack BasicLock in the current frame.
      // We'd like to write:
      //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
      // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
      // additional latency as we have another ST in the store buffer that must drain.

      if (EmitSync & 8192) {
         masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
         masm.get_thread (scrReg) ;
         masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
         masm.movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
      } else
      if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
         masm.movptr(scrReg, boxReg) ;
         masm.movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form: consider XORL tmpReg,tmpReg
           masm.movptr(tmpReg, NULL_WORD) ;
         } else {
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           // Test-And-CAS instead of CAS
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;                   // Locked ?
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Ideally, I'd manifest "Self" with get_thread and then attempt
         // to CAS the register containing Self into m->Owner.
         // But we don't have enough registers, so instead we can either try to CAS
         // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
         // we later store "Self" into m->Owner.  Transiently storing a stack address
         // (rsp or the address of the box) into  m->owner is harmless.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         if (os::is_MP()) { masm.lock();  }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;
         masm.movptr(Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
         masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         masm.get_thread (scrReg) ;                    // beware: clobbers ICCs
         masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ;
         masm.xorptr(boxReg, boxReg) ;                 // set icc.ZFlag = 1 to indicate success

         // If the CAS fails we can either retry or pass control to the slow-path.
         // We use the latter tactic.
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      } else {
         masm.movptr(Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
         masm.movptr(boxReg, tmpReg) ;

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form
           masm.xorptr  (tmpReg, tmpReg) ;
         } else {
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;                   // Locked ?
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Use either "Self" (in scr) or rsp as thread identity in _owner.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         masm.get_thread (scrReg) ;
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;

         // If the CAS fails we can either retry or pass control to the slow-path.
         // We use the latter tactic.
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      }

      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      masm.bind(DONE_LABEL);

      // Avoid branch-to-branch on AMD processors
      // This appears to be superstition.
      if (EmitSync & 32) masm.nop() ;


      // At DONE_LABEL the icc ZFlag is set as follows ...
      // Fast_Unlock uses the same protocol.
      // ZFlag == 1 -> Success
      // ZFlag == 0 -> Failure - force control through the slow-path
    }
  %}
3538 
3539   // obj: object to unlock
3540   // box: box address (displaced header location), killed.  Must be EAX.
3541   // rbx,: killed tmp; cannot be obj nor box.
3542   //
3543   // Some commentary on balanced locking:
3544   //
3545   // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
3546   // Methods that don't have provably balanced locking are forced to run in the
3547   // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
3548   // The interpreter provides two properties:
3549   // I1:  At return-time the interpreter automatically and quietly unlocks any
  //      objects acquired by the current activation (frame).  Recall that the
3551   //      interpreter maintains an on-stack list of locks currently held by
3552   //      a frame.
  // I2:  If a method attempts to unlock an object that is not held by
  //      the frame the interpreter throws IMSX.
3555   //
3556   // Lets say A(), which has provably balanced locking, acquires O and then calls B().
3557   // B() doesn't have provably balanced locking so it runs in the interpreter.
3558   // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
3559   // is still locked by A().
3560   //
3561   // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
3562   // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
3563   // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
3564   // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
3565 
3566   enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{
3567 
3568     Register objReg = as_Register($obj$$reg);
3569     Register boxReg = as_Register($box$$reg);
3570     Register tmpReg = as_Register($tmp$$reg);
3571 
3572     guarantee (objReg != boxReg, "") ;
3573     guarantee (objReg != tmpReg, "") ;
3574     guarantee (boxReg != tmpReg, "") ;
3575     guarantee (boxReg == as_Register(EAX_enc), "") ;
3576     MacroAssembler masm(&cbuf);
3577 
3578     if (EmitSync & 4) {
3579       // Disable - inhibit all inlining.  Force control through the slow-path
3580       masm.cmpptr (rsp, 0) ; 
3581     } else 
3582     if (EmitSync & 8) {
3583       Label DONE_LABEL ;
3584       if (UseBiasedLocking) {
3585          masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3586       }
3587       // classic stack-locking code ...
3588       masm.movptr(tmpReg, Address(boxReg, 0)) ;
3589       masm.testptr(tmpReg, tmpReg) ;
3590       masm.jcc   (Assembler::zero, DONE_LABEL) ;
3591       if (os::is_MP()) { masm.lock(); }
3592       masm.cmpxchgptr(tmpReg, Address(objReg, 0));          // Uses EAX which is box
3593       masm.bind(DONE_LABEL);
3594     } else {
3595       Label DONE_LABEL, Stacked, CheckSucc, Inflated ;
3596 
3597       // Critically, the biased locking test must have precedence over
3598       // and appear before the (box->dhw == 0) recursive stack-lock test.
3599       if (UseBiasedLocking && !UseOptoBiasInlining) {
3600          masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
3601       }
3602       
3603       masm.cmpptr(Address(boxReg, 0), 0) ;            // Examine the displaced header
3604       masm.movptr(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
3605       masm.jccb  (Assembler::zero, DONE_LABEL) ;      // 0 indicates recursive stack-lock
3606 
3607       masm.testptr(tmpReg, 0x02) ;                     // Inflated? 
3608       masm.jccb  (Assembler::zero, Stacked) ;
3609 
3610       masm.bind  (Inflated) ;
3611       // It's inflated.
3612       // Despite our balanced locking property we still check that m->_owner == Self
3613       // as java routines or native JNI code called by this thread might
3614       // have released the lock.
3615       // Refer to the comments in synchronizer.cpp for how we might encode extra
3616       // state in _succ so we can avoid fetching EntryList|cxq.
3617       //
3618       // I'd like to add more cases in fast_lock() and fast_unlock() --
3619       // such as recursive enter and exit -- but we have to be wary of
3620       // I$ bloat, T$ effects and BP$ effects.
3621       //
3622       // If there's no contention try a 1-0 exit.  That is, exit without
3623       // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
3624       // we detect and recover from the race that the 1-0 exit admits.
3625       //
3626       // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
3627       // before it STs null into _owner, releasing the lock.  Updates
3628       // to data protected by the critical section must be visible before
3629       // we drop the lock (and thus before any other thread could acquire
3630       // the lock and observe the fields protected by the lock).
3631       // IA32's memory-model is SPO, so STs are ordered with respect to
3632       // each other and there's no need for an explicit barrier (fence).
3633       // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
3634 
3635       masm.get_thread (boxReg) ;
3636       if ((EmitSync & 4096) && VM_Version::supports_3dnow() && os::is_MP()) {
3637         // prefetchw [ebx + Offset(_owner)-2]
3638         masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
3639       }
3640 
3641       // Note that we could employ various encoding schemes to reduce
3642       // the number of loads below (currently 4) to just 2 or 3.
3643       // Refer to the comments in synchronizer.cpp.
3644       // In practice the chain of fetches doesn't seem to impact performance, however.
3645       if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
3646          // Attempt to reduce branch density - AMD's branch predictor.
3647          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3648          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3649          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3650          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3651          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3652          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3653          masm.jmpb  (DONE_LABEL) ; 
3654       } else { 
3655          masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
3656          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
3657          masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
3658          masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
3659          masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
3660          masm.jccb  (Assembler::notZero, CheckSucc) ; 
3661          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3662          masm.jmpb  (DONE_LABEL) ; 
3663       }
3664 
3665       // The Following code fragment (EmitSync & 65536) improves the performance of
3666       // contended applications and contended synchronization microbenchmarks.
3667       // Unfortunately the emission of the code - even though not executed - causes regressions
3668       // in scimark and jetstream, evidently because of $ effects.  Replacing the code
3669       // with an equal number of never-executed NOPs results in the same regression.
3670       // We leave it off by default.
3671 
3672       if ((EmitSync & 65536) != 0) {
3673          Label LSuccess, LGoSlowPath ;
3674 
3675          masm.bind  (CheckSucc) ;
3676 
3677          // Optional pre-test ... it's safe to elide this
3678          if ((EmitSync & 16) == 0) { 
3679             masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3680             masm.jccb  (Assembler::zero, LGoSlowPath) ; 
3681          }
3682 
3683          // We have a classic Dekker-style idiom:
3684          //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
3685          // There are a number of ways to implement the barrier:
3686          // (1) lock:andl &m->_owner, 0
3687          //     is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
3688          //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
3689          //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
3690          // (2) If supported, an explicit MFENCE is appealing.
3691          //     In older IA32 processors MFENCE is slower than lock:add or xchg
3692          //     particularly if the write-buffer is full as might be the case if
3693          //     if stores closely precede the fence or fence-equivalent instruction.
3694          //     In more modern implementations MFENCE appears faster, however.
3695          // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
3696          //     The $lines underlying the top-of-stack should be in M-state.
3697          //     The locked add instruction is serializing, of course.
3698          // (4) Use xchg, which is serializing
3699          //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
3700          // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
3701          //     The integer condition codes will tell us if succ was 0.
3702          //     Since _succ and _owner should reside in the same $line and
3703          //     we just stored into _owner, it's likely that the $line
3704          //     remains in M-state for the lock:orl.
3705          //
3706          // We currently use (3), although it's likely that switching to (2)
3707          // is correct for the future.
3708             
3709          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
3710          if (os::is_MP()) { 
3711             if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 
3712               masm.mfence();
3713             } else { 
3714               masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; 
3715             }
3716          }
3717          // Ratify _succ remains non-null
3718          masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
3719          masm.jccb  (Assembler::notZero, LSuccess) ; 
3720 
3721          masm.xorptr(boxReg, boxReg) ;                  // box is really EAX
3722          if (os::is_MP()) { masm.lock(); }
3723          masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
3724          masm.jccb  (Assembler::notEqual, LSuccess) ;
3725          // Since we're low on registers we installed rsp as a placeholding in _owner.
3726          // Now install Self over rsp.  This is safe as we're transitioning from
         // non-null to non-null
3728          masm.get_thread (boxReg) ;
3729          masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
3730          // Intentional fall-through into LGoSlowPath ...
3731 
3732          masm.bind  (LGoSlowPath) ; 
3733          masm.orptr(boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
3734          masm.jmpb  (DONE_LABEL) ; 
3735 
3736          masm.bind  (LSuccess) ; 
3737          masm.xorptr(boxReg, boxReg) ;                 // set ICC.ZF=1 to indicate success
3738          masm.jmpb  (DONE_LABEL) ; 
3739       }
3740 
3741       masm.bind (Stacked) ;
3742       // It's not inflated and it's not recursively stack-locked and it's not biased.
3743       // It must be stack-locked.
3744       // Try to reset the header to displaced header.
3745       // The "box" value on the stack is stable, so we can reload
3746       // and be assured we observe the same value as above.
3747       masm.movptr(tmpReg, Address(boxReg, 0)) ;
3748       if (os::is_MP()) {   masm.lock();    }
3749       masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
      // Intentional fall-through into DONE_LABEL
3751 
3752 
3753       // DONE_LABEL is a hot target - we'd really like to place it at the
3754       // start of cache line by padding with NOPs.
3755       // See the AMD and Intel software optimization manuals for the
3756       // most efficient "long" NOP encodings.
3757       // Unfortunately none of our alignment mechanisms suffice.
3758       if ((EmitSync & 65536) == 0) {
3759          masm.bind (CheckSucc) ;
3760       }
3761       masm.bind(DONE_LABEL);
3762 
3763       // Avoid branch to branch on AMD processors
3764       if (EmitSync & 32768) { masm.nop() ; }
3765     }
3766   %}
3767 
3768 
  // Emit a bare one-byte POP EDX (opcode 0x5A).
  enc_class enc_pop_rdx() %{
    emit_opcode(cbuf,0x5A);   // POP EDX
  %}
3772 
  // Rethrow an in-flight exception: a direct 32-bit relative JMP to the
  // shared rethrow stub.  The instruction mark is set first so the
  // runtime-call relocation covers the jump.
  enc_class enc_rethrow() %{
    cbuf.set_inst_mark();
    emit_opcode(cbuf, 0xE9);        // jmp    entry
    emit_d32_reloc(cbuf, (int)OptoRuntime::rethrow_stub() - ((int)cbuf.code_end())-4,
                   runtime_call_Relocation::spec(), RELOC_IMM32 );
  %}
3779 
3780 
3781   // Convert a double to an int.  Java semantics require we do complex
3782   // manglelations in the corner cases.  So we set the rounding mode to
3783   // 'zero', store the darned double down as an int, and reset the
3784   // rounding mode to 'nearest'.  The hardware throws an exception which
3785   // patches up the correct value directly to the stack.
  enc_class D2I_encoding( regD src ) %{
    // Convert the double in FPR0 to an int with Java semantics:
    // set the FPU to truncating (round-to-zero) mode, FISTP the value
    // through the stack into EAX, restore the rounding mode, and if
    // the hardware produced the integer-indefinite value 0x80000000
    // (NaN or out-of-range input) call the d2i wrapper stub to compute
    // the correct Java result.
    //
    // Flip to round-to-zero mode.  We attempted to allow invalid-op
    // exceptions here, so that a NAN or other corner-case value will
    // throw an exception (but normal values get converted at full speed).
    // However, I2C adapters and other float-stack manglers leave pending
    // invalid-op exceptions hanging.  We would have to clear them before
    // enabling them and that is more expensive than just testing for the
    // invalid value Intel stores down in the corner cases.
    emit_opcode(cbuf,0xD9);            // FLDCW  trunc
    emit_opcode(cbuf,0x2D);
    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
    // Allocate a word
    emit_opcode(cbuf,0x83);            // SUB ESP,4
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x04);
    // Encoding assumes a double has been pushed into FPR0.
    // Store down the double as an int, popping the FPU stack
    emit_opcode(cbuf,0xDB);            // FISTP [ESP]
    emit_opcode(cbuf,0x1C);
    emit_d8(cbuf,0x24);
    // Restore the rounding mode; mask the exception
    emit_opcode(cbuf,0xD9);            // FLDCW   std/24-bit mode
    emit_opcode(cbuf,0x2D);
    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
        ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
        : (int)StubRoutines::addr_fpu_cntrl_wrd_std());

    // Load the converted int; adjust CPU stack
    emit_opcode(cbuf,0x58);       // POP EAX
    emit_opcode(cbuf,0x3D);       // CMP EAX,imm
    emit_d32   (cbuf,0x80000000); //         0x80000000
    emit_opcode(cbuf,0x75);       // JNE around_slow_call
    emit_d8    (cbuf,0x07);       // Size of slow_call: FLD (2 bytes) + CALL rel32 (5 bytes)
    // Push src onto stack slow-path
    emit_opcode(cbuf,0xD9 );      // FLD     ST(i)
    emit_d8    (cbuf,0xC0-1+$src$$reg );
    // CALL directly to the runtime
    cbuf.set_inst_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (StubRoutines::d2i_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Carry on here...
  %}
3828 
  // Convert the double in FPR0 to a long (EDX:EAX) with Java semantics.
  // Same scheme as D2I_encoding, but the sentinel for "hardware could
  // not convert" is the 64-bit long-indefinite 0x80000000:00000000 —
  // the slow path is taken only when EDX == 0x80000000 AND EAX == 0.
  enc_class D2L_encoding( regD src ) %{
    emit_opcode(cbuf,0xD9);            // FLDCW  trunc
    emit_opcode(cbuf,0x2D);
    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
    // Allocate a word
    emit_opcode(cbuf,0x83);            // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x08);
    // Encoding assumes a double has been pushed into FPR0.
    // Store down the double as a long, popping the FPU stack
    emit_opcode(cbuf,0xDF);            // FISTP [ESP]
    emit_opcode(cbuf,0x3C);
    emit_d8(cbuf,0x24);
    // Restore the rounding mode; mask the exception
    emit_opcode(cbuf,0xD9);            // FLDCW   std/24-bit mode
    emit_opcode(cbuf,0x2D);
    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
        ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
        : (int)StubRoutines::addr_fpu_cntrl_wrd_std());

    // Load the converted long (EDX:EAX); adjust CPU stack
    emit_opcode(cbuf,0x58);       // POP EAX
    emit_opcode(cbuf,0x5A);       // POP EDX
    emit_opcode(cbuf,0x81);       // CMP EDX,imm
    emit_d8    (cbuf,0xFA);       // rdx
    emit_d32   (cbuf,0x80000000); //         0x80000000
    emit_opcode(cbuf,0x75);       // JNE around_slow_call
    emit_d8    (cbuf,0x07+4);     // Size of slow_call (7) + TEST/JNE pair (4)
    emit_opcode(cbuf,0x85);       // TEST EAX,EAX
    emit_opcode(cbuf,0xC0);       // 2/rax,/rax,
    emit_opcode(cbuf,0x75);       // JNE around_slow_call
    emit_d8    (cbuf,0x07);       // Size of slow_call: FLD (2 bytes) + CALL rel32 (5 bytes)
    // Push src onto stack slow-path
    emit_opcode(cbuf,0xD9 );      // FLD     ST(i)
    emit_d8    (cbuf,0xC0-1+$src$$reg );
    // CALL directly to the runtime
    cbuf.set_inst_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Carry on here...
  %}
3870 
  // Convert the float in XMM register $src to a long (EDX:EAX) with Java
  // semantics.  The value is spilled through the stack into the x87 unit
  // (there is no SSE float->int64 convert on ia32), converted with a
  // truncating FISTP, and if the long-indefinite sentinel
  // 0x80000000:00000000 comes back, the float is respilled and the
  // shared d2l wrapper stub is called to compute the correct result.
  enc_class X2L_encoding( regX src ) %{
    // Allocate a word
    emit_opcode(cbuf,0x83);      // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x08);

    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
    emit_opcode(cbuf,0x2D);
    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());

    // Encoding assumes the float has been pushed into FPR0.
    // Store it down as a long, popping the FPU stack
    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
    emit_opcode(cbuf,0x3C);
    emit_d8(cbuf,0x24);

    // Restore the rounding mode; mask the exception
    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
    emit_opcode(cbuf,0x2D);
    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());

    // Load the converted long (EDX:EAX); adjust CPU stack
    emit_opcode(cbuf,0x58);      // POP EAX

    emit_opcode(cbuf,0x5A);      // POP EDX

    emit_opcode(cbuf,0x81);      // CMP EDX,imm
    emit_d8    (cbuf,0xFA);      // rdx
    emit_d32   (cbuf,0x80000000);//         0x80000000

    emit_opcode(cbuf,0x75);      // JNE around_slow_call
    emit_d8    (cbuf,0x13+4);    // Size of slow_call (0x13) + TEST/JNE pair (4)

    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
    emit_opcode(cbuf,0xC0);      // 2/rax,/rax,

    emit_opcode(cbuf,0x75);      // JNE around_slow_call
    emit_d8    (cbuf,0x13);      // Size of slow_call: SUB(3)+MOVSS(5)+FLD_S(3)+ADD(3)+CALL(5)

    // Slow path: respill src to the x87 stack for the wrapper.
    // Allocate a word
    emit_opcode(cbuf,0x83);      // SUB ESP,4
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x04);

    emit_opcode  (cbuf, 0xF3 );  // MOVSS [ESP], src
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xD9 );     // FLD_S [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0x83);      // ADD ESP,4
    emit_opcode(cbuf,0xC4);
    emit_d8(cbuf,0x04);

    // CALL directly to the runtime (the d2l wrapper also serves floats
    // since the operand is on the x87 stack at full precision)
    cbuf.set_inst_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Carry on here...
  %}
3943 
  // Convert the double in XMM register $src to a long (EDX:EAX) with
  // Java semantics.  Same structure as X2L_encoding but with MOVSD /
  // FLD_D and an 8-byte respill on the slow path.
  enc_class XD2L_encoding( regXD src ) %{
    // Allocate a word
    emit_opcode(cbuf,0x83);      // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x08);

    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xD9);      // FLDCW  trunc
    emit_opcode(cbuf,0x2D);
    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());

    // Encoding assumes a double has been pushed into FPR0.
    // Store down the double as a long, popping the FPU stack
    emit_opcode(cbuf,0xDF);      // FISTP [ESP]
    emit_opcode(cbuf,0x3C);
    emit_d8(cbuf,0x24);

    // Restore the rounding mode; mask the exception
    emit_opcode(cbuf,0xD9);      // FLDCW   std/24-bit mode
    emit_opcode(cbuf,0x2D);
    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
      ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
      : (int)StubRoutines::addr_fpu_cntrl_wrd_std());

    // Load the converted long (EDX:EAX); adjust CPU stack
    emit_opcode(cbuf,0x58);      // POP EAX

    emit_opcode(cbuf,0x5A);      // POP EDX

    emit_opcode(cbuf,0x81);      // CMP EDX,imm
    emit_d8    (cbuf,0xFA);      // rdx
    emit_d32   (cbuf,0x80000000); //         0x80000000

    emit_opcode(cbuf,0x75);      // JNE around_slow_call
    emit_d8    (cbuf,0x13+4);    // Size of slow_call (0x13) + TEST/JNE pair (4)

    emit_opcode(cbuf,0x85);      // TEST EAX,EAX
    emit_opcode(cbuf,0xC0);      // 2/rax,/rax,

    emit_opcode(cbuf,0x75);      // JNE around_slow_call
    emit_d8    (cbuf,0x13);      // Size of slow_call: SUB(3)+MOVSD(5)+FLD_D(3)+ADD(3)+CALL(5)

    // Push src onto stack slow-path
    // Allocate a word
    emit_opcode(cbuf,0x83);      // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x08);

    emit_opcode  (cbuf, 0xF2 );  // MOVSD [ESP], src
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xDD );     // FLD_D [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0x83);      // ADD ESP,8
    emit_opcode(cbuf,0xC4);
    emit_d8(cbuf,0x08);

    // CALL directly to the runtime
    cbuf.set_inst_mark();
    emit_opcode(cbuf,0xE8);      // Call into runtime
    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Carry on here...
  %}
4017 
  // Move an x87 double into an XMM register as a float, via a 4-byte
  // stack temporary.  If src is not already at the top of the x87 stack
  // it is first duplicated to the top with FLD ST(i-1), and the store
  // then pops that copy (FSTP, /3) instead of leaving it (FST, /2).
  enc_class D2X_encoding( regX dst, regD src ) %{
    // Allocate a word
    emit_opcode(cbuf,0x83);            // SUB ESP,4
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x04);
    int pop = 0x02;                    // /2 = FST_S (no pop)
    if ($src$$reg != FPR1L_enc) {
      emit_opcode( cbuf, 0xD9 );       // FLD    ST(i-1)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
      pop = 0x03;                      // /3 = FSTP_S (pop the duplicate)
    }
    store_to_stackslot( cbuf, 0xD9, pop, 0 ); // FST<P>_S  [ESP]

    emit_opcode  (cbuf, 0xF3 );        // MOVSS dst(xmm), [ESP]
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x10 );
    encode_RegMem(cbuf, $dst$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0x83);            // ADD ESP,4
    emit_opcode(cbuf,0xC4);
    emit_d8(cbuf,0x04);
    // Carry on here...
  %}
4041 
  // Finish an SSE truncating float/double -> int convert and handle the
  // 0x80000000 sentinel via the d2i wrapper stub.  $primary selects the
  // double flavor (8-byte spill, MOVSD/FLD_D) vs the float flavor.
  // NOTE(review): only the ModRM byte of the convert is emitted here —
  // the CVTT opcode bytes presumably come from the instruct's own
  // encoding; confirm against the ins_encode users of this enc_class.
  enc_class FX2I_encoding( regX src, eRegI dst ) %{
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);

    // Compare the result to see if we need to go to the slow path
    emit_opcode(cbuf,0x81);       // CMP dst,imm
    emit_rm    (cbuf,0x3,0x7,$dst$$reg);
    emit_d32   (cbuf,0x80000000); //         0x80000000

    emit_opcode(cbuf,0x75);       // JNE around_slow_call
    emit_d8    (cbuf,0x13);       // Size of slow_call
    // Store xmm to a temp memory
    // location and push it onto stack.

    emit_opcode(cbuf,0x83);  // SUB ESP,4 (or 8 when $primary/double)
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf, $primary ? 0x8 : 0x4);

    emit_opcode  (cbuf, $primary ? 0xF2 : 0xF3 );   // MOVSD/MOVSS [ESP], xmm
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf, $primary ? 0xDD : 0xD9 );      // FLD [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0x83);    // ADD ESP,4 (or 8 when $primary/double)
    emit_opcode(cbuf,0xC4);
    emit_d8(cbuf, $primary ? 0x8 : 0x4);

    // CALL directly to the runtime
    cbuf.set_inst_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (StubRoutines::d2i_wrapper() - cbuf.code_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );

    // Carry on here...
  %}
4078 
  // Move a float from an XMM register onto the x87 stack, via a 4-byte
  // stack temporary (MOVSS store, FLD_S reload, pop the temporary).
  enc_class X2D_encoding( regD dst, regX src ) %{
    // Allocate a word
    emit_opcode(cbuf,0x83);     // SUB ESP,4
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x04);

    emit_opcode  (cbuf, 0xF3 ); // MOVSS [ESP], xmm
    emit_opcode  (cbuf, 0x0F );
    emit_opcode  (cbuf, 0x11 );
    encode_RegMem(cbuf, $src$$reg, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0xD9 );    // FLD_S [ESP]
    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);

    emit_opcode(cbuf,0x83);     // ADD ESP,4
    emit_opcode(cbuf,0xC4);
    emit_d8(cbuf,0x04);

    // Carry on here...
  %}
4099 
  // Absolute value of a float in XMM: clear the sign bit with
  // ANDPS $dst,[float_signmask_pool].
  enc_class AbsXF_encoding(regX dst) %{
    address signmask_address=(address)float_signmask_pool;
    // ANDPS  $dst,[signconst]
    emit_opcode(cbuf, 0x0F);
    emit_opcode(cbuf, 0x54);
    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);   // mod=00, r/m=101: [disp32] absolute
    emit_d32(cbuf, (int)signmask_address);
  %}
4108 
  // Absolute value of a double in XMM: clear the sign bit with
  // ANDPD $dst,[double_signmask_pool] (0x66 prefix selects ANDPD).
  enc_class AbsXD_encoding(regXD dst) %{
    address signmask_address=(address)double_signmask_pool;
    // ANDPD  $dst,[signconst]
    emit_opcode(cbuf, 0x66);
    emit_opcode(cbuf, 0x0F);
    emit_opcode(cbuf, 0x54);
    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);   // mod=00, r/m=101: [disp32] absolute
    emit_d32(cbuf, (int)signmask_address);
  %}
4118 
  // Negate a float in XMM: flip the sign bit with
  // XORPS $dst,[float_signflip_pool].
  enc_class NegXF_encoding(regX dst) %{
    address signmask_address=(address)float_signflip_pool;
    // XORPS  $dst,[signconst]
    emit_opcode(cbuf, 0x0F);
    emit_opcode(cbuf, 0x57);
    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);   // mod=00, r/m=101: [disp32] absolute
    emit_d32(cbuf, (int)signmask_address);
  %}
4127 
  // Negate a double in XMM: flip the sign bit with
  // XORPD $dst,[double_signflip_pool] (0x66 prefix selects XORPD).
  enc_class NegXD_encoding(regXD dst) %{
    address signmask_address=(address)double_signflip_pool;
    // XORPD  $dst,[signconst]
    emit_opcode(cbuf, 0x66);
    emit_opcode(cbuf, 0x0F);
    emit_opcode(cbuf, 0x57);
    emit_rm(cbuf, 0x0, $dst$$reg, 0x5);   // mod=00, r/m=101: [disp32] absolute
    emit_d32(cbuf, (int)signmask_address);
  %}
4137 
  // Multiply the x87 stack top by register ST(i), result into ST.
  enc_class FMul_ST_reg( eRegF src1 ) %{
    // Operand was loaded from memory into fp ST (stack top)
    // FMUL   ST,$src  /* D8 C8+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC8 + $src1$$reg);
  %}
4144 
  // Add register ST(i) into the x87 stack top, result into ST.
  enc_class FAdd_ST_reg( eRegF src2 ) %{
    // FADD   ST,src2  /* D8 C0+i */  (non-popping add)
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC0 + $src2$$reg);
    //could use FADDP  src2,fpST  /* DE C0+i */
  %}
4151 
  // Add the x87 stack top into register ST(i) and pop the stack.
  enc_class FAddP_reg_ST( eRegF src2 ) %{
    // FADDP  src2,ST  /* DE C0+i */
    emit_opcode(cbuf, 0xDE);
    emit_opcode(cbuf, 0xC0 + $src2$$reg);
  %}
4157 
  // Fused subtract-then-divide on the x87 stack top:
  // ST = (ST - ST(src1)) / ST(src2).
  enc_class subF_divF_encode( eRegF src1, eRegF src2) %{
    // Operand has been loaded into fp ST (stack top)
      // FSUB   ST,$src1  /* D8 E0+i */
      emit_opcode(cbuf, 0xD8);
      emit_opcode(cbuf, 0xE0 + $src1$$reg);

      // FDIV   ST,$src2  /* D8 F0+i */
      emit_opcode(cbuf, 0xD8);
      emit_opcode(cbuf, 0xF0 + $src2$$reg);
  %}
4168 
  // Fused add-then-multiply on the x87 stack top:
  // ST = (ST + ST(src1)) * ST(src2).
  enc_class MulFAddF (eRegF src1, eRegF src2) %{
    // Operand was loaded from memory into fp ST (stack top)
    // FADD   ST,$src  /* D8 C0+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC0 + $src1$$reg);

    // FMUL  ST,src2  /* D8 C8+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC8 + $src2$$reg);
  %}
4179 
4180 
  // Fused add-then-multiply, storing into src2 and popping the stack:
  // ST(src2) = ST(src2) * (ST + ST(src1)); pop.
  enc_class MulFAddFreverse (eRegF src1, eRegF src2) %{
    // Operand was loaded from memory into fp ST (stack top)
    // FADD   ST,$src  /* D8 C0+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC0 + $src1$$reg);

    // FMULP  src2,ST  /* DE C8+i */
    emit_opcode(cbuf, 0xDE);
    emit_opcode(cbuf, 0xC8 + $src2$$reg);
  %}
4191 
4192   // Atomically load the volatile long
  // Atomically load the volatile long: a 64-bit FILD from $mem is a
  // single atomic read; the value is then spilled to the destination
  // stack slot with a 64-bit FISTP (DF /7), popping the FPU stack.
  enc_class enc_loadL_volatile( memory mem, stackSlotL dst ) %{
    emit_opcode(cbuf,0xDF);                  // FILD qword [$mem]  (DF /5)
    int rm_byte_opcode = 0x05;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
    store_to_stackslot( cbuf, 0x0DF, 0x07, $dst$$disp );  // FISTP qword [dst slot]
  %}
4204 
  // Atomically load the volatile long through an XMM temporary:
  // one 64-bit SSE load from $mem, one 64-bit SSE store to the
  // destination stack slot.  Both 8-byte accesses are atomic.
  enc_class enc_loadLX_volatile( memory mem, stackSlotL dst, regXD tmp ) %{
    { // Atomic long load
      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
      int base     = $mem$$base;
      int index    = $mem$$index;
      int scale    = $mem$$scale;
      int displace = $mem$$disp;
      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
    }
    { // MOVSD $dst,$tmp ! atomic long store
      emit_opcode(cbuf,0xF2);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x11);
      int base     = $dst$$base;
      int index    = $dst$$index;
      int scale    = $dst$$scale;
      int displace = $dst$$disp;
      bool disp_is_oop = $dst->disp_is_oop(); // disp-as-oop when working with static globals
      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
    }
  %}
4230 
  // Atomically load the volatile long into a GPR pair: one 64-bit SSE
  // load into $tmp, then split it — MOVD the low 32 bits into $dst.lo,
  // shift $tmp right by 32 (PSRLQ), MOVD the high 32 bits into $dst.hi.
  enc_class enc_loadLX_reg_volatile( memory mem, eRegL dst, regXD tmp ) %{
    { // Atomic long load
      // UseXmmLoadAndClearUpper ? movsd $tmp,$mem : movlpd $tmp,$mem
      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
      int base     = $mem$$base;
      int index    = $mem$$index;
      int scale    = $mem$$scale;
      int displace = $mem$$disp;
      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
    }
    { // MOVD $dst.lo,$tmp
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x7E);
      emit_rm(cbuf, 0x3, $tmp$$reg, $dst$$reg);
    }
    { // PSRLQ $tmp,32
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x73);
      emit_rm(cbuf, 0x3, 0x02, $tmp$$reg);   // /2 = PSRLQ
      emit_d8(cbuf, 0x20);                   // shift count 32
    }
    { // MOVD $dst.hi,$tmp
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x7E);
      emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg));
    }
  %}
4264 
4265   // Volatile Store Long.  Must be atomic, so move it into
4266   // the FP TOS and then do a 64-bit FIST.  Has to probe the
4267   // target address before the store (for null-ptr checks)
4268   // so the memory operand is used twice in the encoding.
  enc_class enc_storeL_volatile( memory mem, stackSlotL src ) %{
    store_to_stackslot( cbuf, 0x0DF, 0x05, $src$$disp );  // FILD qword [src slot] (DF /5)
    cbuf.set_inst_mark();            // Mark start of FIST in case $mem has an oop
    emit_opcode(cbuf,0xDF);                               // FISTP qword [$mem]    (DF /7)
    int rm_byte_opcode = 0x07;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
  %}
4281 
  // Volatile long store through an XMM temporary: one atomic 64-bit SSE
  // load from the source stack slot, one atomic 64-bit MOVSD to $mem.
  enc_class enc_storeLX_volatile( memory mem, stackSlotL src, regXD tmp) %{
    { // Atomic long load
      // UseXmmLoadAndClearUpper ? movsd $tmp,[$src] : movlpd $tmp,[$src]
      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0xF2 : 0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,UseXmmLoadAndClearUpper ? 0x10 : 0x12);
      int base     = $src$$base;
      int index    = $src$$index;
      int scale    = $src$$scale;
      int displace = $src$$disp;
      bool disp_is_oop = $src->disp_is_oop(); // disp-as-oop when working with static globals
      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
    }
    cbuf.set_inst_mark();            // Mark start of MOVSD in case $mem has an oop
    { // MOVSD $mem,$tmp ! atomic long store
      emit_opcode(cbuf,0xF2);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x11);
      int base     = $mem$$base;
      int index    = $mem$$index;
      int scale    = $mem$$scale;
      int displace = $mem$$disp;
      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
    }
  %}
4308 
  // Volatile long store from a GPR pair: MOVD the low and high halves
  // into two XMM temporaries, interleave them with PUNPCKLDQ so $tmp
  // holds the full 64-bit value, then one atomic MOVSD to $mem.
  enc_class enc_storeLX_reg_volatile( memory mem, eRegL src, regXD tmp, regXD tmp2) %{
    { // MOVD $tmp,$src.lo
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x6E);
      emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
    }
    { // MOVD $tmp2,$src.hi
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x6E);
      emit_rm(cbuf, 0x3, $tmp2$$reg, HIGH_FROM_LOW($src$$reg));
    }
    { // PUNPCKLDQ $tmp,$tmp2
      emit_opcode(cbuf,0x66);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x62);
      emit_rm(cbuf, 0x3, $tmp$$reg, $tmp2$$reg);
    }
    cbuf.set_inst_mark();            // Mark start of MOVSD in case $mem has an oop
    { // MOVSD $mem,$tmp ! atomic long store
      emit_opcode(cbuf,0xF2);
      emit_opcode(cbuf,0x0F);
      emit_opcode(cbuf,0x11);
      int base     = $mem$$base;
      int index    = $mem$$index;
      int scale    = $mem$$scale;
      int displace = $mem$$disp;
      bool disp_is_oop = $mem->disp_is_oop(); // disp-as-oop when working with static globals
      encode_RegMem(cbuf, $tmp$$reg, base, index, scale, displace, disp_is_oop);
    }
  %}
4341 
4342   // Safepoint Poll.  This polls the safepoint page, and causes an
4343   // exception if it is not readable. Unfortunately, it kills the condition code
4344   // in the process
// We currently use TESTL [spp],EDI
4346   // A better choice might be TESTB [spp + pagesize() - CacheLineSize()],0
4347 
  // Emit TEST EDI,[polling_page].  The read faults when the VM protects
  // the polling page, diverting the thread into safepoint handling.
  // The poll-type relocation lets the VM recognize the faulting PC.
  enc_class Safepoint_Poll() %{
    cbuf.relocate(cbuf.inst_mark(), relocInfo::poll_type, 0);
    emit_opcode(cbuf,0x85);                    // TEST r/m32,r32
    emit_rm (cbuf, 0x0, 0x7, 0x5);             // mod=00, reg=EDI, r/m=101: [disp32]
    emit_d32(cbuf, (intptr_t)os::get_polling_page());
  %}
4354 %}
4355 
4356 
4357 //----------FRAME--------------------------------------------------------------
4358 // Definition of frame structure and management information.
4359 //
4360 //  S T A C K   L A Y O U T    Allocators stack-slot number
4361 //                             |   (to get allocators register number
4362 //  G  Owned by    |        |  v    add OptoReg::stack0())
4363 //  r   CALLER     |        |
4364 //  o     |        +--------+      pad to even-align allocators stack-slot
4365 //  w     V        |  pad0  |        numbers; owned by CALLER
4366 //  t   -----------+--------+----> Matcher::_in_arg_limit, unaligned
4367 //  h     ^        |   in   |  5
4368 //        |        |  args  |  4   Holes in incoming args owned by SELF
4369 //  |     |        |        |  3
4370 //  |     |        +--------+
4371 //  V     |        | old out|      Empty on Intel, window on Sparc
4372 //        |    old |preserve|      Must be even aligned.
4373 //        |     SP-+--------+----> Matcher::_old_SP, even aligned
4374 //        |        |   in   |  3   area for Intel ret address
4375 //     Owned by    |preserve|      Empty on Sparc.
4376 //       SELF      +--------+
4377 //        |        |  pad2  |  2   pad to align old SP
4378 //        |        +--------+  1
4379 //        |        | locks  |  0
4380 //        |        +--------+----> OptoReg::stack0(), even aligned
4381 //        |        |  pad1  | 11   pad to align new SP
4382 //        |        +--------+
4383 //        |        |        | 10
4384 //        |        | spills |  9   spills
4385 //        V        |        |  8   (pad0 slot for callee)
4386 //      -----------+--------+----> Matcher::_out_arg_limit, unaligned
4387 //        ^        |  out   |  7
4388 //        |        |  args  |  6   Holes in outgoing args owned by CALLEE
4389 //     Owned by    +--------+
4390 //      CALLEE     | new out|  6   Empty on Intel, window on Sparc
4391 //        |    new |preserve|      Must be even-aligned.
4392 //        |     SP-+--------+----> Matcher::_new_SP, even aligned
4393 //        |        |        |
4394 //
4395 // Note 1: Only region 8-11 is determined by the allocator.  Region 0-5 is
4396 //         known from SELF's arguments and the Java calling convention.
4397 //         Region 6-7 is determined per call site.
// Note 2: If the calling convention leaves holes in the incoming argument
//         area, those holes are owned by SELF.  Holes in the outgoing area
//         are owned by the CALLEE.  Holes should not be necessary in the
//         incoming area, as the Java calling convention is completely under
//         the control of the AD file.  Doubles can be sorted and packed to
//         avoid holes.  Holes in the outgoing arguments may be necessary for
//         varargs C calling conventions.
4405 // Note 3: Region 0-3 is even aligned, with pad2 as needed.  Region 3-5 is
4406 //         even aligned with pad0 as needed.
4407 //         Region 6 is even aligned.  Region 6-7 is NOT even aligned;
4408 //         region 6-11 is even aligned; it may be padded out more so that
4409 //         the region from SP to FP meets the minimum stack alignment.
4410 
4411 frame %{
4412   // What direction does stack grow in (assumed to be same for C & Java)
4413   stack_direction(TOWARDS_LOW);
4414 
4415   // These three registers define part of the calling convention
4416   // between compiled code and the interpreter.
4417   inline_cache_reg(EAX);                // Inline Cache Register
4418   interpreter_method_oop_reg(EBX);      // Method Oop Register when calling interpreter
4419 
4420   // Optional: name the operand used by cisc-spilling to access [stack_pointer + offset]
4421   cisc_spilling_operand_name(indOffset32);
4422 
4423   // Number of stack slots consumed by locking an object
4424   sync_stack_slots(1);
4425 
4426   // Compiled code's Frame Pointer
4427   frame_pointer(ESP);
4428   // Interpreter stores its frame pointer in a register which is
4429   // stored to the stack by I2CAdaptors.
4430   // I2CAdaptors convert from interpreted java to compiled java.
4431   interpreter_frame_pointer(EBP);
4432 
4433   // Stack alignment requirement
4434   // Alignment size in bytes (128-bit -> 16 bytes)
4435   stack_alignment(StackAlignmentInBytes);
4436 
4437   // Number of stack slots between incoming argument block and the start of
4438   // a new frame.  The PROLOG must add this many slots to the stack.  The
4439   // EPILOG must remove this many slots.  Intel needs one slot for
4440   // return address and one for rbp, (must save rbp)
4441   in_preserve_stack_slots(2+VerifyStackAtCalls);
4442 
4443   // Number of outgoing stack slots killed above the out_preserve_stack_slots
4444   // for calls to C.  Supports the var-args backing area for register parms.
4445   varargs_C_out_slots_killed(0);
4446 
4447   // The after-PROLOG location of the return address.  Location of
4448   // return address specifies a type (REG or STACK) and a number
4449   // representing the register number (i.e. - use a register name) or
4450   // stack slot.
4451   // Ret Addr is on stack in slot 0 if no locks or verification or alignment.
4452   // Otherwise, it is above the locks and verification slot and alignment word
4453   return_addr(STACK - 1 +
4454               round_to(1+VerifyStackAtCalls+
4455               Compile::current()->fixed_slots(),
4456               (StackAlignmentInBytes/wordSize)));
4457 
4458   // Body of function which returns an integer array locating
4459   // arguments either in registers or in stack slots.  Passed an array
4460   // of ideal registers called "sig" and a "length" count.  Stack-slot
4461   // offsets are based on outgoing arguments, i.e. a CALLER setting up
4462   // arguments for a CALLEE.  Incoming stack arguments are
4463   // automatically biased by the preserve_stack_slots field above.
  calling_convention %{
    // Java calling convention: delegate entirely to the shared runtime.
    // No difference between incoming/outgoing argument layout, so just pass
    // is_outgoing == false unconditionally.
    SharedRuntime::java_calling_convention(sig_bt, regs, length, false);
  %}
4468 
4469 
4470   // Body of function which returns an integer array locating
4471   // arguments either in registers or in stack slots.  Passed an array
4472   // of ideal registers called "sig" and a "length" count.  Stack-slot
4473   // offsets are based on outgoing arguments, i.e. a CALLER setting up
4474   // arguments for a CALLEE.  Incoming stack arguments are
4475   // automatically biased by the preserve_stack_slots field above.
  c_calling_convention %{
    // Native (C) calling convention: delegate to the shared runtime.
    // This is obviously always outgoing, so no flag is needed; the return
    // value (stack-slot count) is not used here.
    (void) SharedRuntime::c_calling_convention(sig_bt, regs, length);
  %}
4480 
4481   // Location of C & interpreter return values
  c_return_value %{
    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
    // Return-register tables indexed by ideal register type.  Each result is
    // an (hi, lo) register pair; OptoReg::Bad marks an unused high half
    // (e.g. 32-bit int results use only EAX, longs use EDX:EAX).
    static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };

    // in SSE2+ mode we want to keep the FPU stack clean so pretend
    // that C functions return float and double results in XMM0.
    if( ideal_reg == Op_RegD && UseSSE>=2 )
      return OptoRegPair(XMM0b_num,XMM0a_num);
    if( ideal_reg == Op_RegF && UseSSE>=2 )
      return OptoRegPair(OptoReg::Bad,XMM0a_num);

    // Default: table lookup (int/long in GPRs, float/double on the FPU stack).
    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  %}
4496 
4497   // Location of return values
  return_value %{
    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
    // Same (hi, lo) return-register tables as c_return_value above.
    static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
    if( ideal_reg == Op_RegD && UseSSE>=2 )
      return OptoRegPair(XMM0b_num,XMM0a_num);
    // NOTE: UseSSE>=1 here, vs. UseSSE>=2 in c_return_value above.  The
    // Java-to-Java convention can return floats in XMM0 as soon as SSE1 is
    // available; this asymmetry is deliberate, not a typo.
    if( ideal_reg == Op_RegF && UseSSE>=1 )
      return OptoRegPair(OptoReg::Bad,XMM0a_num);
    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  %}
4508 
4509 %}
4510 
4511 //----------ATTRIBUTES---------------------------------------------------------
4512 //----------Operand Attributes-------------------------------------------------
op_attrib op_cost(0);        // Required cost attribute

//----------Instruction Attributes---------------------------------------------
ins_attrib ins_cost(100);       // Required cost attribute
ins_attrib ins_size(8);         // Required size attribute (in bits)
ins_attrib ins_pc_relative(0);  // Required PC Relative flag
ins_attrib ins_short_branch(0); // Required flag: is this instruction a
                                // non-matching short branch variant of some
                                // long branch?
ins_attrib ins_alignment(1);    // Required alignment attribute (must be a power of 2)
                                // specifies the alignment that some part of the instruction (not
                                // necessarily the start) requires.  If > 1, a compute_padding()
                                // function must be provided for the instruction
4526 
4527 //----------OPERANDS-----------------------------------------------------------
4528 // Operand definitions must precede instruction definitions for correct parsing
4529 // in the ADLC because operands constitute user defined types which are used in
4530 // instruction definitions.
4531 
4532 //----------Simple Operands----------------------------------------------------
4533 // Immediate Operands
4534 // Integer Immediate
operand immI() %{
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for test vs zero
operand immI0() %{
  predicate(n->get_int() == 0);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for increment
operand immI1() %{
  predicate(n->get_int() == 1);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for decrement
operand immI_M1() %{
  predicate(n->get_int() == -1);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Valid scale values for addressing modes: 0..3, i.e. index*1/2/4/8
operand immI2() %{
  predicate(0 <= n->get_int() && (n->get_int() <= 3));
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// 8-bit signed immediate (fits in a single byte of the instruction)
operand immI8() %{
  predicate((-128 <= n->get_int()) && (n->get_int() <= 127));
  match(ConI);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// 16-bit signed immediate
operand immI16() %{
  predicate((-32768 <= n->get_int()) && (n->get_int() <= 32767));
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}
4599 
// Constant for long shifts
operand immI_32() %{
  predicate( n->get_int() == 32 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Shift count in [1,31] (stays within one 32-bit word)
operand immI_1_31() %{
  predicate( n->get_int() >= 1 && n->get_int() <= 31 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Shift count in [32,63]
operand immI_32_63() %{
  predicate( n->get_int() >= 32 && n->get_int() <= 63 );
  match(ConI);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// The constant 1
operand immI_1() %{
  predicate( n->get_int() == 1 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// The constant 2
operand immI_2() %{
  predicate( n->get_int() == 2 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// The constant 3
operand immI_3() %{
  predicate( n->get_int() == 3 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}
4654 
// Pointer Immediate
operand immP() %{
  match(ConP);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// NULL Pointer Immediate
operand immP0() %{
  predicate( n->get_ptr() == 0 );
  match(ConP);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate
operand immL() %{
  match(ConL);

  op_cost(20);
  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate zero
operand immL0() %{
  predicate( n->get_long() == 0L );
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate: the value -1 (all bits set)
operand immL_M1() %{
  predicate( n->get_long() == -1L );
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long immediate from 0 to 127.
// Used for a shorter form of long mul by 10.
operand immL_127() %{
  predicate((0 <= n->get_long()) && (n->get_long() <= 127));
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate: low 32-bit mask
operand immL_32bits() %{
  predicate(n->get_long() == 0xFFFFFFFFL);
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate: value fits in a signed 32-bit int
operand immL32() %{
  predicate(n->get_long() == (int)(n->get_long()));
  match(ConL);
  op_cost(20);

  format %{ %}
  interface(CONST_INTER);
%}
4733 
// Double Immediate zero (x87 mode, UseSSE<=1)
operand immD0() %{
  // Do additional (and counter-intuitive) test against NaN to work around VC++
  // bug that generates code such that NaNs compare equal to 0.0
  predicate( UseSSE<=1 && n->getd() == 0.0 && !g_isnan(n->getd()) );
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate one (x87 mode)
operand immD1() %{
  predicate( UseSSE<=1 && n->getd() == 1.0 );
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate (x87 mode, UseSSE<=1)
operand immD() %{
  predicate(UseSSE<=1);
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate (SSE2 mode)
operand immXD() %{
  predicate(UseSSE>=2);
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate zero (SSE2 mode): bit-pattern zero, so +0.0 only
operand immXD0() %{
  // Do additional (and counter-intuitive) test against NaN to work around VC++
  // bug that generates code such that NaNs compare equal to 0.0 AND do not
  // compare equal to -0.0.
  predicate( UseSSE>=2 && jlong_cast(n->getd()) == 0 );
  match(ConD);

  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate zero (x87 mode, UseSSE == 0)
operand immF0() %{
  predicate( UseSSE == 0 && n->getf() == 0.0 );
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate (x87 mode)
operand immF() %{
  predicate( UseSSE == 0 );
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate (SSE mode)
operand immXF() %{
  predicate(UseSSE >= 1);
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate zero.  Zero and not -0.0 (bit-pattern test)
operand immXF0() %{
  predicate( UseSSE >= 1 && jint_cast(n->getf()) == 0 );
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}
4826 
// Immediates for special shifts (sign extend)

// Constant 16: shift count for the special sign-extending shifts
operand immI_16() %{
  predicate( n->get_int() == 16 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Constant 24: shift count for the special sign-extending shifts
operand immI_24() %{
  predicate( n->get_int() == 24 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Constant for byte-wide masking
operand immI_255() %{
  predicate( n->get_int() == 255 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Constant for short-wide masking
operand immI_65535() %{
  predicate(n->get_int() == 65535);
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}
4863 
// Register Operands
// Integer Register: any allocatable 32-bit GPR (e_reg class)
operand eRegI() %{
  constraint(ALLOC_IN_RC(e_reg));
  match(RegI);
  match(xRegI);
  match(eAXRegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eDXRegI);
  match(eDIRegI);
  match(eSIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Subset of Integer Register: only the x_reg class (the four registers
// matched below: EAX/EBX/ECX/EDX)
operand xRegI(eRegI reg) %{
  constraint(ALLOC_IN_RC(x_reg));
  match(reg);
  match(eAXRegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eDXRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Special Registers
operand eAXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(eax_reg));
  match(reg);
  match(eRegI);

  format %{ "EAX" %}
  interface(REG_INTER);
%}

// EBX register operand
operand eBXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(ebx_reg));
  match(reg);
  match(eRegI);

  format %{ "EBX" %}
  interface(REG_INTER);
%}

// ECX register operand
operand eCXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(ecx_reg));
  match(reg);
  match(eRegI);

  format %{ "ECX" %}
  interface(REG_INTER);
%}

// EDX register operand
operand eDXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(edx_reg));
  match(reg);
  match(eRegI);

  format %{ "EDX" %}
  interface(REG_INTER);
%}

// EDI register operand
operand eDIRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(edi_reg));
  match(reg);
  match(eRegI);

  format %{ "EDI" %}
  interface(REG_INTER);
%}

// Integer register drawn from the nax_reg ("no EAX") class; sub-operands
// are ECX/EDX/ESI/EDI
operand naxRegI() %{
  constraint(ALLOC_IN_RC(nax_reg));
  match(RegI);
  match(eCXRegI);
  match(eDXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Integer register drawn from the nadx_reg ("no EAX/EDX") class
operand nadxRegI() %{
  constraint(ALLOC_IN_RC(nadx_reg));
  match(RegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Integer register drawn from the ncx_reg ("no ECX") class
operand ncxRegI() %{
  constraint(ALLOC_IN_RC(ncx_reg));
  match(RegI);
  match(eAXRegI);
  match(eDXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// // This operand was used by cmpFastUnlock, but conflicted with 'object' reg
// //
operand eSIRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(esi_reg));
   match(reg);
   match(eRegI);

   format %{ "ESI" %}
   interface(REG_INTER);
%}
4987 
// Pointer Register: any register (any_reg class)
operand anyRegP() %{
  constraint(ALLOC_IN_RC(any_reg));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);
  match(eRegP);

  format %{ %}
  interface(REG_INTER);
%}

// Pointer Register: any allocatable GPR (e_reg class)
operand eRegP() %{
  constraint(ALLOC_IN_RC(e_reg));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// On windows95, EBP is not safe to use for implicit null tests.
operand eRegP_no_EBP() %{
  constraint(ALLOC_IN_RC(e_reg_no_rbp));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);

  op_cost(100);
  format %{ %}
  interface(REG_INTER);
%}

// Pointer register drawn from the nax_reg ("no EAX") class
operand naxRegP() %{
  constraint(ALLOC_IN_RC(nax_reg));
  match(RegP);
  match(eBXRegP);
  match(eDXRegP);
  match(eCXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// Pointer register drawn from the nabx_reg ("no EAX/EBX") class
operand nabxRegP() %{
  constraint(ALLOC_IN_RC(nabx_reg));
  match(RegP);
  match(eCXRegP);
  match(eDXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// Pointer register drawn from the p_reg class
operand pRegP() %{
  constraint(ALLOC_IN_RC(p_reg));
  match(RegP);
  match(eBXRegP);
  match(eDXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// Special Registers
// Return a pointer value
operand eAXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(eax_reg));
  match(reg);
  format %{ "EAX" %}
  interface(REG_INTER);
%}

// Used in AtomicAdd
operand eBXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(ebx_reg));
  match(reg);
  format %{ "EBX" %}
  interface(REG_INTER);
%}

// Tail-call (interprocedural jump) to interpreter
operand eCXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(ecx_reg));
  match(reg);
  format %{ "ECX" %}
  interface(REG_INTER);
%}

// ESI pointer register
operand eSIRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(reg);
  format %{ "ESI" %}
  interface(REG_INTER);
%}

// Used in rep stosw
operand eDIRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(edi_reg));
  match(reg);
  format %{ "EDI" %}
  interface(REG_INTER);
%}

// EBP pointer register
operand eBPRegP() %{
  constraint(ALLOC_IN_RC(ebp_reg));
  match(RegP);
  format %{ "EBP" %}
  interface(REG_INTER);
%}
5111 
// Long register: an allocatable register pair (long_reg class)
operand eRegL() %{
  constraint(ALLOC_IN_RC(long_reg));
  match(RegL);
  match(eADXRegL);

  format %{ %}
  interface(REG_INTER);
%}

// The EDX:EAX register pair
operand eADXRegL( eRegL reg ) %{
  constraint(ALLOC_IN_RC(eadx_reg));
  match(reg);

  format %{ "EDX:EAX" %}
  interface(REG_INTER);
%}

// The EBX:ECX register pair
operand eBCXRegL( eRegL reg ) %{
  constraint(ALLOC_IN_RC(ebcx_reg));
  match(reg);

  format %{ "EBX:ECX" %}
  interface(REG_INTER);
%}

// Special case for integer high multiply
operand eADXRegL_low_only() %{
  constraint(ALLOC_IN_RC(eadx_reg));
  match(RegL);

  format %{ "EAX" %}
  interface(REG_INTER);
%}
5145 
// Flags register, used as output of compare instructions
operand eFlagsReg() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);

  format %{ "EFLAGS" %}
  interface(REG_INTER);
%}

// Flags register, used as output of FLOATING POINT compare instructions
operand eFlagsRegU() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);

  format %{ "EFLAGS_U" %}
  interface(REG_INTER);
%}

// Flags register variant for unordered-aware FP compares.
// predicate(false): never produced by ordinary matching; instructions name
// this operand explicitly.
operand eFlagsRegUCF() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  predicate(false);

  format %{ "EFLAGS_U_CF" %}
  interface(REG_INTER);
%}

// Condition Code Register used by long compare
operand flagsReg_long_LTGE() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_LTGE" %}
  interface(REG_INTER);
%}
// Condition Code Register used by long compare (EQ/NE variant)
operand flagsReg_long_EQNE() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_EQNE" %}
  interface(REG_INTER);
%}
// Condition Code Register used by long compare (LE/GT variant)
operand flagsReg_long_LEGT() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_LEGT" %}
  interface(REG_INTER);
%}
5192 
// Double register operands (x87 FPU stack, UseSSE < 2)
operand regD() %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(dbl_reg));
  match(RegD);
  match(regDPR1);
  match(regDPR2);
  format %{ %}
  interface(REG_INTER);
%}

// FPR1 double register
operand regDPR1(regD reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(dbl_reg0));
  match(reg);
  format %{ "FPR1" %}
  interface(REG_INTER);
%}

// FPR2 double register
operand regDPR2(regD reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(dbl_reg1));
  match(reg);
  format %{ "FPR2" %}
  interface(REG_INTER);
%}

// Double register excluding FPR1 (dbl_notreg0 class)
operand regnotDPR1(regD reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(dbl_notreg0));
  match(reg);
  format %{ %}
  interface(REG_INTER);
%}

// XMM Double register operands
operand regXD() %{
  predicate( UseSSE>=2 );
  constraint(ALLOC_IN_RC(xdb_reg));
  match(RegD);
  match(regXD6);
  match(regXD7);
  format %{ %}
  interface(REG_INTER);
%}

// XMM6 double register operands
operand regXD6(regXD reg) %{
  predicate( UseSSE>=2 );
  constraint(ALLOC_IN_RC(xdb_reg6));
  match(reg);
  format %{ "XMM6" %}
  interface(REG_INTER);
%}

// XMM7 double register operands
operand regXD7(regXD reg) %{
  predicate( UseSSE>=2 );
  constraint(ALLOC_IN_RC(xdb_reg7));
  match(reg);
  format %{ "XMM7" %}
  interface(REG_INTER);
%}

// Float register operands (x87 FPU stack, UseSSE < 2)
operand regF() %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(flt_reg));
  match(RegF);
  match(regFPR1);
  format %{ %}
  interface(REG_INTER);
%}

// FPR1 float register
operand regFPR1(regF reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(flt_reg0));
  match(reg);
  format %{ "FPR1" %}
  interface(REG_INTER);
%}

// XMM register operands (single precision, UseSSE >= 1)
operand regX() %{
  predicate( UseSSE>=1 );
  constraint(ALLOC_IN_RC(xmm_reg));
  match(RegF);
  format %{ %}
  interface(REG_INTER);
%}
5284 
5285 
//----------Memory Operands----------------------------------------------------
// Direct Memory Operand: absolute address given by a pointer constant
operand direct(immP addr) %{
  match(addr);

  format %{ "[$addr]" %}
  interface(MEMORY_INTER) %{
    base(0xFFFFFFFF);
    index(0x4);      // 0x4 encodes "no index register" (cf. stackSlot operands)
    scale(0x0);
    disp($addr);
  %}
%}

// Indirect Memory Operand
operand indirect(eRegP reg) %{
  constraint(ALLOC_IN_RC(e_reg));
  match(reg);

  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Short Offset Operand
operand indOffset8(eRegP reg, immI8 off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32(eRegP reg, immI off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
// NOTE: reversed match order — a constant pointer base plus an integer
// register offset: (AddP off reg).
operand indOffset32X(eRegI reg, immP off) %{
  match(AddP off reg);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndexOffset(eRegP reg, eRegI ireg, immI off) %{
  match(AddP (AddP reg ireg) off);

  op_cost(10);
  format %{"[$reg + $off + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndex(eRegP reg, eRegI ireg) %{
  match(AddP reg ireg);

  op_cost(10);
  format %{"[$reg + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp(0x0);
  %}
%}

// // -------------------------------------------------------------------------
// // 486 architecture doesn't support "scale * index + offset" with out a base
// // -------------------------------------------------------------------------
// // Scaled Memory Operands
// // Indirect Memory Times Scale Plus Offset Operand
// operand indScaleOffset(immP off, eRegI ireg, immI2 scale) %{
//   match(AddP off (LShiftI ireg scale));
//
//   op_cost(10);
//   format %{"[$off + $ireg << $scale]" %}
//   interface(MEMORY_INTER) %{
//     base(0x4);
//     index($ireg);
//     scale($scale);
//     disp($off);
//   %}
// %}

// Indirect Memory Times Scale Plus Index Register
operand indIndexScale(eRegP reg, eRegI ireg, immI2 scale) %{
  match(AddP reg (LShiftI ireg scale));

  op_cost(10);
  format %{"[$reg + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp(0x0);
  %}
%}

// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
operand indIndexScaleOffset(eRegP reg, immI off, eRegI ireg, immI2 scale) %{
  match(AddP (AddP reg (LShiftI ireg scale)) off);

  op_cost(10);
  format %{"[$reg + $off + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp($off);
  %}
%}
5426 
//----------Load Long Memory Operands------------------------------------------
// The load-long idiom will use its address expression again after loading
// the first word of the long.  If the load-long destination overlaps with
// registers used in the addressing expression, the 2nd half will be loaded
// from a clobbered address.  Fix this by requiring that load-long use
// address registers that do not overlap with the load-long target.

// load-long support: address register restricted to ESI
operand load_long_RegP() %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(RegP);
  match(eSIRegP);
  op_cost(100);
  format %{  %}
  interface(REG_INTER);
%}

// Indirect Memory Operand Long
operand load_long_indirect(load_long_RegP reg) %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(reg);

  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand load_long_indOffset32(load_long_RegP reg, immI off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp($off);
  %}
%}

// Memory operands usable by the load-long idiom
opclass load_long_memory(load_long_indirect, load_long_indOffset32);
5472 
5473 
//----------Special Memory Operands--------------------------------------------
// Stack Slot Operand - This operand is used for loading and storing temporary
//                      values on the stack where a match requires a value to
//                      flow through memory.  All stack slots are addressed
//                      relative to ESP.

// Pointer stack slot
operand stackSlotP(sRegP reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

// Integer stack slot
operand stackSlotI(sRegI reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

// Float stack slot
operand stackSlotF(sRegF reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

// Double stack slot
operand stackSlotD(sRegD reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

// Long stack slot
operand stackSlotL(sRegL reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}
5537 
//----------Memory Operands - Win95 Implicit Null Variants----------------
// These mirror the plain memory operands above but take their base from
// eRegP_no_EBP (on windows95, EBP is not safe to use for implicit null
// tests) and carry op_cost(100).

// Indirect Memory Operand
operand indirect_win95_safe(eRegP_no_EBP reg)
%{
  constraint(ALLOC_IN_RC(e_reg));
  match(reg);

  op_cost(100);
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Short Offset Operand
operand indOffset8_win95_safe(eRegP_no_EBP reg, immI8 off)
%{
  match(AddP reg off);

  op_cost(100);
  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32_win95_safe(eRegP_no_EBP reg, immI off)
%{
  match(AddP reg off);

  op_cost(100);
  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);      // no index register
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndexOffset_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI off)
%{
  match(AddP (AddP reg ireg) off);

  op_cost(100);
  format %{"[$reg + $off + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Times Scale Plus Index Register
operand indIndexScale_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI2 scale)
%{
  match(AddP reg (LShiftI ireg scale));

  op_cost(100);
  format %{"[$reg + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp(0x0);
  %}
%}

// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
operand indIndexScaleOffset_win95_safe(eRegP_no_EBP reg, immI off, eRegI ireg, immI2 scale)
%{
  match(AddP (AddP reg (LShiftI ireg scale)) off);

  op_cost(100);
  format %{"[$reg + $off + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp($off);
  %}
%}
5629 
5630 //----------Conditional Branch Operands----------------------------------------
5631 // Comparison Op  - This is the operation of the comparison, and is limited to
5632 //                  the following set of codes:
5633 //                  L (<), LE (<=), G (>), GE (>=), E (==), NE (!=)
5634 //
5635 // Other attributes of the comparison, such as unsignedness, are specified
5636 // by the comparison instruction that sets a condition code flags register.
5637 // That result is represented by a flags operand whose subtype is appropriate
5638 // to the unsignedness (etc.) of the comparison.
5639 //
5640 // Later, the instruction which matches both the Comparison Op (a Bool) and
5641 // the flags (produced by the Cmp) specifies the coding of the comparison op
5642 // by matching a specific subtype of Bool operand below, such as cmpOpU.
5643 
5644 // Comparision Code
operand cmpOp() %{
  match(Bool);

  format %{ "" %}
  // Signed comparison.  The hex values are the x86 condition-code
  // nibbles used by Jcc/SETcc/CMOVcc (e.g. 0x4 = E, 0xC = L).
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0xC, "l");
    greater_equal(0xD, "ge");
    less_equal(0xE, "le");
    greater(0xF, "g");
  %}
%}
5658 
5659 // Comparison Code, unsigned compare.  Used by FP also, with
5660 // C2 (unordered) turned into GT or LT already.  The other bits
5661 // C0 and C3 are turned into Carry & Zero flags.
operand cmpOpU() %{
  match(Bool);

  format %{ "" %}
  // Unsigned comparison: uses the below/above condition-code nibbles
  // (0x2 = B, 0x3 = AE, 0x6 = BE, 0x7 = A).
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0x2, "b");
    greater_equal(0x3, "nb");
    less_equal(0x6, "be");
    greater(0x7, "nbe");
  %}
%}
5675 
5676 // Floating comparisons that don't require any fixup for the unordered case
operand cmpOpUCF() %{
  match(Bool);
  // Only the strict/non-strict ordering tests; eq/ne need fixup for
  // the unordered case and are handled by cmpOpUCF2 below.
  predicate(n->as_Bool()->_test._test == BoolTest::lt ||
            n->as_Bool()->_test._test == BoolTest::ge ||
            n->as_Bool()->_test._test == BoolTest::le ||
            n->as_Bool()->_test._test == BoolTest::gt);
  format %{ "" %}
  // Same unsigned condition-code nibbles as cmpOpU.
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0x2, "b");
    greater_equal(0x3, "nb");
    less_equal(0x6, "be");
    greater(0x7, "nbe");
  %}
%}
5693 
5694 
5695 // Floating comparisons that can be fixed up with extra conditional jumps
operand cmpOpUCF2() %{
  match(Bool);
  // eq/ne float comparisons: the instruction matching this operand must
  // emit extra conditional jumps to handle the unordered (NaN) case.
  predicate(n->as_Bool()->_test._test == BoolTest::ne ||
            n->as_Bool()->_test._test == BoolTest::eq);
  format %{ "" %}
  // Same unsigned condition-code nibbles as cmpOpU.
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0x2, "b");
    greater_equal(0x3, "nb");
    less_equal(0x6, "be");
    greater(0x7, "nbe");
  %}
%}
5710 
5711 // Comparison Code for FP conditional move
operand cmpOp_fcmov() %{
  match(Bool);

  format %{ "" %}
  // Encodings select the FCMOVcc form: low byte is the ModRM-style
  // opcode byte (0xC0/0xC8/0xD0 families) and the 0x100 bit appears to
  // select between the DA and DB opcode prefixes -- confirm against the
  // fcmov encode classes elsewhere in this file.
  interface(COND_INTER) %{
    equal        (0x0C8);
    not_equal    (0x1C8);
    less         (0x0C0);
    greater_equal(0x1C0);
    less_equal   (0x0D0);
    greater      (0x1D0);
  %}
%}
5725 
// Comparison Code used in long compares
operand cmpOp_commute() %{
  match(Bool);

  format %{ "" %}
  // Swapped-operand form: each test maps to the condition code of its
  // mirror image (less -> "g", greater -> "l", etc.), for long compares
  // where the operands are compared in reverse order.
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0xF, "g");
    greater_equal(0xE, "le");
    less_equal(0xD, "ge");
    greater(0xC, "l");
  %}
%}
5740 
5741 //----------OPERAND CLASSES----------------------------------------------------
// Operand Classes are groups of operands that are used to simplify
5743 // instruction definitions by not requiring the AD writer to specify separate
5744 // instructions for every form of operand when the instruction accepts
5745 // multiple operand types with the same basic encoding and format.  The classic
5746 // case of this is memory operands.
5747 
// All general 32-bit addressing forms accepted by instructions that take
// a "memory" operand.  Note the win95_safe variants are deliberately
// not part of this class.
opclass memory(direct, indirect, indOffset8, indOffset32, indOffset32X, indIndexOffset,
               indIndex, indIndexScale, indIndexScaleOffset);
5750 
5751 // Long memory operations are encoded in 2 instructions and a +4 offset.
5752 // This means some kind of offset is always required and you cannot use
5753 // an oop as the offset (done when working on static globals).
// Same as "memory" minus indOffset32X: long accesses are split into two
// instructions with a +4 offset, so an oop-based (relocatable) offset
// cannot be used.
opclass long_memory(direct, indirect, indOffset8, indOffset32, indIndexOffset,
                    indIndex, indIndexScale, indIndexScaleOffset);
5756 
5757 
5758 //----------PIPELINE-----------------------------------------------------------
5759 // Rules which define the behavior of the target architectures pipeline.
pipeline %{

//----------ATTRIBUTES---------------------------------------------------------
attributes %{
  variable_size_instructions;        // Instructions are variable-sized
  max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  instruction_unit_size = 1;         // An instruction is 1 byte long
  instruction_fetch_unit_size = 16;  // The processor fetches one line
  instruction_fetch_units = 1;       // of 16 bytes

  // List of nop instructions
  nops( MachNop );
%}

//----------RESOURCES----------------------------------------------------------
// Resources are the functional units available to the machine

// Generic P2/P3 pipeline
// 3 decoders, only D0 handles big operands; a "bundle" is the limit of
// 3 instructions decoded per cycle.
// 2 load/store ops per cycle, 1 branch, 1 FPU,
// 2 ALU op, only ALU0 handles mul/div instructions.
resources( D0, D1, D2, DECODE = D0 | D1 | D2,
           MS0, MS1, MEM = MS0 | MS1,
           BR, FPU,
           ALU0, ALU1, ALU = ALU0 | ALU1 );

//----------PIPELINE DESCRIPTION-----------------------------------------------
// Pipeline Description specifies the stages in the machine's pipeline

// Generic P2/P3 pipeline
pipe_desc(S0, S1, S2, S3, S4, S5);

//----------PIPELINE CLASSES---------------------------------------------------
// Pipeline Classes describe the stages in which input and output are
// referenced by the hardware pipeline.

// Naming convention: ialu or fpu
// Then: _reg
// Then: _reg if there is a 2nd register
// Then: _long if it's a pair of instructions implementing a long
// Then: _fat if it requires the big decoder
//   Or: _mem if it requires the big decoder and a memory unit.

// Integer ALU reg operation
pipe_class ialu_reg(eRegI dst) %{
    single_instruction;
    dst    : S4(write);
    dst    : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Long ALU reg operation
pipe_class ialu_reg_long(eRegL dst) %{
    instruction_count(2);
    dst    : S4(write);
    dst    : S3(read);
    DECODE : S0(2);     // any 2 decoders
    ALU    : S3(2);     // both alus
%}

// Integer ALU reg operation using big decoder
pipe_class ialu_reg_fat(eRegI dst) %{
    single_instruction;
    dst    : S4(write);
    dst    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S3;        // any alu
%}

// Long ALU reg operation using big decoder
pipe_class ialu_reg_long_fat(eRegL dst) %{
    instruction_count(2);
    dst    : S4(write);
    dst    : S3(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S3(2);     // any 2 alus
%}

// Integer ALU reg-reg operation
pipe_class ialu_reg_reg(eRegI dst, eRegI src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Long ALU reg-reg operation
pipe_class ialu_reg_reg_long(eRegL dst, eRegL src) %{
    instruction_count(2);
    dst    : S4(write);
    src    : S3(read);
    DECODE : S0(2);     // any 2 decoders
    ALU    : S3(2);     // both alus
%}

// Integer ALU reg-reg operation using big decoder
// NOTE(review): src is declared as a memory operand here, unlike the
// other reg-reg classes -- confirm whether eRegI was intended.
pipe_class ialu_reg_reg_fat(eRegI dst, memory src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S3;        // any alu
%}

// Long ALU reg-reg operation using big decoder
pipe_class ialu_reg_reg_long_fat(eRegL dst, eRegL src) %{
    instruction_count(2);
    dst    : S4(write);
    src    : S3(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S3(2);     // both alus
%}

// Integer ALU reg-mem operation
pipe_class ialu_reg_mem(eRegI dst, memory mem) %{
    single_instruction;
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;        // any mem
%}

// Long ALU reg-mem operation
// (the load_long_memory opclass is not visible in this section of the
// file -- it is presumably defined near the long-load instructions)
pipe_class ialu_reg_long_mem(eRegL dst, load_long_memory mem) %{
    instruction_count(2);
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S4(2);     // any 2 alus
    MEM    : S3(2);     // both mems
%}

// Integer mem operation (prefetch)
pipe_class ialu_mem(memory mem)
%{
    single_instruction;
    mem    : S3(read);
    D0     : S0;        // big decoder only
    MEM    : S3;        // any mem
%}

// Integer Store to Memory
pipe_class ialu_mem_reg(memory mem, eRegI src) %{
    single_instruction;
    mem    : S3(read);
    src    : S5(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;
%}

// Long Store to Memory
pipe_class ialu_mem_long_reg(memory mem, eRegL src) %{
    instruction_count(2);
    mem    : S3(read);
    src    : S5(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S4(2);     // any 2 alus
    MEM    : S3(2);     // Both mems
%}

// Integer Store of immediate to Memory
pipe_class ialu_mem_imm(memory mem) %{
    single_instruction;
    mem    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;
%}

// Integer ALU0 reg-reg operation
pipe_class ialu_reg_reg_alu0(eRegI dst, eRegI src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    D0     : S0;        // Big decoder only
    ALU0   : S3;        // only alu0
%}

// Integer ALU0 reg-mem operation
pipe_class ialu_reg_mem_alu0(eRegI dst, memory mem) %{
    single_instruction;
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    ALU0   : S4;        // ALU0 only
    MEM    : S3;        // any mem
%}

// Integer ALU reg-reg compare (writes flags only)
pipe_class ialu_cr_reg_reg(eFlagsReg cr, eRegI src1, eRegI src2) %{
    single_instruction;
    cr     : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Integer ALU reg-imm compare (writes flags only)
pipe_class ialu_cr_reg_imm(eFlagsReg cr, eRegI src1) %{
    single_instruction;
    cr     : S4(write);
    src1   : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Integer ALU reg-mem compare (writes flags only)
pipe_class ialu_cr_reg_mem(eFlagsReg cr, eRegI src1, memory src2) %{
    single_instruction;
    cr     : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;
%}

// Conditional move reg-reg (CmpLTMask-style idiom; 4 instructions)
pipe_class pipe_cmplt( eRegI p, eRegI q, eRegI y ) %{
    instruction_count(4);
    y      : S4(read);
    q      : S3(read);
    p      : S3(read);
    DECODE : S0(4);     // any decoder
%}

// Conditional move reg-reg
pipe_class pipe_cmov_reg( eRegI dst, eRegI src, eFlagsReg cr ) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0;        // any decoder
%}

// Conditional move reg-mem
pipe_class pipe_cmov_mem( eFlagsReg cr, eRegI dst, memory src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0;        // any decoder
    MEM    : S3;
%}

// Conditional move reg-reg long
pipe_class pipe_cmov_reg_long( eFlagsReg cr, eRegL dst, eRegL src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0(2);     // any 2 decoders
%}

// Conditional move double reg-reg
pipe_class pipe_cmovD_reg( eFlagsReg cr, regDPR1 dst, regD src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0;        // any decoder
%}

// Float reg operation
pipe_class fpu_reg(regD dst) %{
    instruction_count(2);
    dst    : S3(read);  // NOTE(review): dst marked read, not write -- confirm
    DECODE : S0(2);     // any 2 decoders
    FPU    : S3;
%}

// Float reg-reg operation
pipe_class fpu_reg_reg(regD dst, regD src) %{
    instruction_count(2);
    dst    : S4(write);
    src    : S3(read);
    DECODE : S0(2);     // any 2 decoders
    FPU    : S3;
%}

// Float reg-reg-reg operation
pipe_class fpu_reg_reg_reg(regD dst, regD src1, regD src2) %{
    instruction_count(3);
    dst    : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    DECODE : S0(3);     // any 3 decoders
    FPU    : S3(2);
%}

// Float reg-reg-reg-reg operation
pipe_class fpu_reg_reg_reg_reg(regD dst, regD src1, regD src2, regD src3) %{
    instruction_count(4);
    dst    : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    src3   : S3(read);
    DECODE : S0(4);     // any 4 decoders
    FPU    : S3(2);
%}

// Float reg-mem-reg-reg operation
pipe_class fpu_reg_mem_reg_reg(regD dst, memory src1, regD src2, regD src3) %{
    instruction_count(4);
    dst    : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    src3   : S3(read);
    DECODE : S1(3);     // any 3 decoders
    D0     : S0;        // Big decoder only
    FPU    : S3(2);
    MEM    : S3;
%}

// Float reg-mem operation
pipe_class fpu_reg_mem(regD dst, memory mem) %{
    instruction_count(2);
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    DECODE : S1;        // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float reg-reg-mem operation
pipe_class fpu_reg_reg_mem(regD dst, regD src1, memory mem) %{
    instruction_count(3);
    dst    : S5(write);
    src1   : S3(read);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    DECODE : S1(2);     // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float mem-reg operation
pipe_class fpu_mem_reg(memory mem, regD src) %{
    instruction_count(2);
    src    : S5(read);
    mem    : S3(read);
    DECODE : S0;        // any decoder for FPU PUSH
    D0     : S1;        // big decoder only
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float mem-reg-reg operation
pipe_class fpu_mem_reg_reg(memory mem, regD src1, regD src2) %{
    instruction_count(3);
    src1   : S3(read);
    src2   : S3(read);
    mem    : S3(read);
    DECODE : S0(2);     // any decoder for FPU PUSH
    D0     : S1;        // big decoder only
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float mem-reg-mem operation
pipe_class fpu_mem_reg_mem(memory mem, regD src1, memory src2) %{
    instruction_count(3);
    src1   : S3(read);
    src2   : S3(read);
    mem    : S4(read);
    DECODE : S0;        // any decoder for FPU PUSH
    D0     : S0(2);     // big decoder only
    FPU    : S4;
    MEM    : S3(2);     // any mem
%}

// Float mem-mem operation
pipe_class fpu_mem_mem(memory dst, memory src1) %{
    instruction_count(2);
    src1   : S3(read);
    dst    : S4(read);
    D0     : S0(2);     // big decoder only
    MEM    : S3(2);     // any mem
%}

// Float mem-mem-mem operation
pipe_class fpu_mem_mem_mem(memory dst, memory src1, memory src2) %{
    instruction_count(3);
    src1   : S3(read);
    src2   : S3(read);
    dst    : S4(read);
    D0     : S0(3);     // big decoder only
    FPU    : S4;
    MEM    : S3(3);     // any mem
%}

// Float mem-reg operation with constant
pipe_class fpu_mem_reg_con(memory mem, regD src1) %{
    instruction_count(3);
    src1   : S4(read);
    mem    : S4(read);
    DECODE : S0;        // any decoder for FPU PUSH
    D0     : S0(2);     // big decoder only
    FPU    : S4;
    MEM    : S3(2);     // any mem
%}

// Float load constant
pipe_class fpu_reg_con(regD dst) %{
    instruction_count(2);
    dst    : S5(write);
    D0     : S0;        // big decoder only for the load
    DECODE : S1;        // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float load constant plus register source
pipe_class fpu_reg_reg_con(regD dst, regD src) %{
    instruction_count(3);
    dst    : S5(write);
    src    : S3(read);
    D0     : S0;        // big decoder only for the load
    DECODE : S1(2);     // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// UnConditional branch
pipe_class pipe_jmp( label labl ) %{
    single_instruction;
    BR   : S3;
%}

// Conditional branch
pipe_class pipe_jcc( cmpOp cmp, eFlagsReg cr, label labl ) %{
    single_instruction;
    cr    : S1(read);
    BR    : S3;
%}

// Allocation idiom (CMPXCHG-style; serializes the pipeline)
pipe_class pipe_cmpxchg( eRegP dst, eRegP heap_ptr ) %{
    instruction_count(1); force_serialization;
    fixed_latency(6);
    heap_ptr : S3(read);
    DECODE   : S0(3);
    D0       : S2;
    MEM      : S3;
    ALU      : S3(2);
    dst      : S5(write);
    BR       : S5;
%}

// Generic big/slow expanded idiom
pipe_class pipe_slow(  ) %{
    instruction_count(10); multiple_bundles; force_serialization;
    fixed_latency(100);
    D0  : S0(2);
    MEM : S3(2);
%}

// The real do-nothing guy
pipe_class empty( ) %{
    instruction_count(0);
%}

// Define the class for the Nop node
define %{
   MachNop = empty;
%}

%}
6230 
6231 //----------INSTRUCTIONS-------------------------------------------------------
6232 //
6233 // match      -- States which machine-independent subtree may be replaced
6234 //               by this instruction.
6235 // ins_cost   -- The estimated cost of this instruction is used by instruction
6236 //               selection to identify a minimum cost tree of machine
6237 //               instructions that matches a tree of machine-independent
6238 //               instructions.
6239 // format     -- A string providing the disassembly for this instruction.
6240 //               The value of an instruction's operand may be inserted
6241 //               by referring to it with a '$' prefix.
6242 // opcode     -- Three instruction opcodes may be provided.  These are referred
6243 //               to within an encode class as $primary, $secondary, and $tertiary
6244 //               respectively.  The primary opcode is commonly used to
6245 //               indicate the type of machine instruction, while secondary
6246 //               and tertiary are often used for prefix options or addressing
6247 //               modes.
6248 // ins_encode -- A list of encode classes with parameters. The encode class
6249 //               name must have been defined in an 'enc_class' specification
6250 //               in the encode section of the architecture description.
6251 
6252 //----------BSWAP-Instruction--------------------------------------------------
// Reverse the byte order of a 32-bit register in place with BSWAP.
instruct bytes_reverse_int(eRegI dst) %{
  match(Set dst (ReverseBytesI dst));

  format %{ "BSWAP  $dst" %}
  opcode(0x0F, 0xC8);               // 0F C8+rd = BSWAP r32
  ins_encode( OpcP, OpcSReg(dst) );
  ins_pipe( ialu_reg );
%}
6261 
// Reverse the byte order of a 64-bit register pair: byte-swap each
// 32-bit half, then exchange the halves.
instruct bytes_reverse_long(eRegL dst) %{
  match(Set dst (ReverseBytesL dst));

  format %{ "BSWAP  $dst.lo\n\t"
            "BSWAP  $dst.hi\n\t"
            "XCHG   $dst.lo $dst.hi" %}

  ins_cost(125);
  ins_encode( bswap_long_bytes(dst) );
  ins_pipe( ialu_reg_reg);
%}
6273 
6274 
6275 //---------- Zeros Count Instructions ------------------------------------------
6276 
// Count leading zeros of an int using the LZCNT instruction
// (only when the CPU supports it -- see the predicate).
instruct countLeadingZerosI(eRegI dst, eRegI src, eFlagsReg cr) %{
  predicate(UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosI src));
  effect(KILL cr);   // LZCNT writes the flags

  format %{ "LZCNT  $dst, $src\t# count leading zeros (int)" %}
  ins_encode %{
    __ lzcntl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg);
%}
6288 
// Count leading zeros of an int without LZCNT, via BSR:
// clz(x) = 31 - (index of highest set bit); zero input yields 32.
instruct countLeadingZerosI_bsr(eRegI dst, eRegI src, eFlagsReg cr) %{
  predicate(!UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosI src));
  effect(KILL cr);

  format %{ "BSR    $dst, $src\t# count leading zeros (int)\n\t"
            "JNZ    skip\n\t"
            "MOV    $dst, -1\n"
      "skip:\n\t"
            "NEG    $dst\n\t"
            "ADD    $dst, 31" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label skip;
    __ bsrl(Rdst, Rsrc);                 // bit index of highest set bit; ZF set if Rsrc == 0
    __ jccb(Assembler::notZero, skip);
    __ movl(Rdst, -1);                   // zero input: fake index -1 so the result below is 32
    __ bind(skip);
    __ negl(Rdst);
    __ addl(Rdst, BitsPerInt - 1);       // clz = 31 - index
  %}
  ins_pipe(ialu_reg);
%}
6313 
// Count leading zeros of a long using LZCNT on each 32-bit half.
instruct countLeadingZerosL(eRegI dst, eRegL src, eFlagsReg cr) %{
  predicate(UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosL src));
  effect(TEMP dst, KILL cr);

  format %{ "LZCNT  $dst, $src.hi\t# count leading zeros (long)\n\t"
            "JNC    done\n\t"
            "LZCNT  $dst, $src.lo\n\t"
            "ADD    $dst, 32\n"
      "done:" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label done;
    __ lzcntl(Rdst, HIGH_FROM_LOW(Rsrc)); // LZCNT sets CF when its source is zero
    __ jccb(Assembler::carryClear, done); // hi != 0: result is clz(hi)
    __ lzcntl(Rdst, Rsrc);                // hi == 0: result is 32 + clz(lo)
    __ addl(Rdst, BitsPerInt);
    __ bind(done);
  %}
  ins_pipe(ialu_reg);
%}
6336 
// Count leading zeros of a long without LZCNT, via BSR:
// clz(x) = 63 - (index of highest set bit over the 64-bit value);
// an all-zero input yields 64.
instruct countLeadingZerosL_bsr(eRegI dst, eRegL src, eFlagsReg cr) %{
  predicate(!UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosL src));
  effect(TEMP dst, KILL cr);

  format %{ "BSR    $dst, $src.hi\t# count leading zeros (long)\n\t"
            "JZ     msw_is_zero\n\t"
            "ADD    $dst, 32\n\t"
            "JMP    not_zero\n"
      "msw_is_zero:\n\t"
            "BSR    $dst, $src.lo\n\t"
            "JNZ    not_zero\n\t"
            "MOV    $dst, -1\n"
      "not_zero:\n\t"
            "NEG    $dst\n\t"
            "ADD    $dst, 63\n" %}
 ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label msw_is_zero;
    Label not_zero;
    __ bsrl(Rdst, HIGH_FROM_LOW(Rsrc));   // index of highest bit in hi word; ZF set if hi == 0
    __ jccb(Assembler::zero, msw_is_zero);
    __ addl(Rdst, BitsPerInt);            // hi != 0: 64-bit index = 32 + index
    __ jmpb(not_zero);
    __ bind(msw_is_zero);
    __ bsrl(Rdst, Rsrc);                  // hi == 0: search the low word
    __ jccb(Assembler::notZero, not_zero);
    __ movl(Rdst, -1);                    // both words zero: fake index -1 so the result is 64
    __ bind(not_zero);
    __ negl(Rdst);
    __ addl(Rdst, BitsPerLong - 1);       // clz = 63 - index
  %}
  ins_pipe(ialu_reg);
%}
6372 
// Count trailing zeros of an int via BSF; zero input yields 32.
instruct countTrailingZerosI(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (CountTrailingZerosI src));
  effect(KILL cr);

  format %{ "BSF    $dst, $src\t# count trailing zeros (int)\n\t"
            "JNZ    done\n\t"
            "MOV    $dst, 32\n"
      "done:" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Label done;
    __ bsfl(Rdst, $src$$Register);       // index of lowest set bit; ZF set if src == 0
    __ jccb(Assembler::notZero, done);
    __ movl(Rdst, BitsPerInt);           // zero input: answer is 32
    __ bind(done);
  %}
  ins_pipe(ialu_reg);
%}
6391 
// Count trailing zeros of a long via BSF on each half; an all-zero
// input falls through both searches and yields 32 + 32 = 64.
instruct countTrailingZerosL(eRegI dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (CountTrailingZerosL src));
  effect(TEMP dst, KILL cr);

  format %{ "BSF    $dst, $src.lo\t# count trailing zeros (long)\n\t"
            "JNZ    done\n\t"
            "BSF    $dst, $src.hi\n\t"
            "JNZ    msw_not_zero\n\t"
            "MOV    $dst, 32\n"
      "msw_not_zero:\n\t"
            "ADD    $dst, 32\n"
      "done:" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label msw_not_zero;
    Label done;
    __ bsfl(Rdst, Rsrc);                       // low word first; hit => answer is its bit index
    __ jccb(Assembler::notZero, done);
    __ bsfl(Rdst, HIGH_FROM_LOW(Rsrc));        // low word zero: search the high word
    __ jccb(Assembler::notZero, msw_not_zero);
    __ movl(Rdst, BitsPerInt);                 // both zero: 32 here, +32 below = 64
    __ bind(msw_not_zero);
    __ addl(Rdst, BitsPerInt);                 // bits found in the high word count from 32
    __ bind(done);
  %}
  ins_pipe(ialu_reg);
%}
6420 
6421 
6422 //---------- Population Count Instructions -------------------------------------
6423 
// Population count of an int using the POPCNT instruction
// (only when the CPU supports it -- see the predicate).
instruct popCountI(eRegI dst, eRegI src) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountI src));

  format %{ "POPCNT $dst, $src" %}
  ins_encode %{
    __ popcntl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg);
%}
6434 
// Population count of an int loaded directly from memory.
instruct popCountI_mem(eRegI dst, memory mem) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountI (LoadI mem)));

  format %{ "POPCNT $dst, $mem" %}
  ins_encode %{
    __ popcntl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg);
%}
6445 
6446 // Note: Long.bitCount(long) returns an int.
// Population count of a long: POPCNT each 32-bit half and add.
// Note: Long.bitCount(long) returns an int.
instruct popCountL(eRegI dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountL src));
  effect(KILL cr, TEMP tmp, TEMP dst);   // ADD clobbers flags

  format %{ "POPCNT $dst, $src.lo\n\t"
            "POPCNT $tmp, $src.hi\n\t"
            "ADD    $dst, $tmp" %}
  ins_encode %{
    __ popcntl($dst$$Register, $src$$Register);                  // low word
    __ popcntl($tmp$$Register, HIGH_FROM_LOW($src$$Register));   // high word
    __ addl($dst$$Register, $tmp$$Register);
  %}
  ins_pipe(ialu_reg);
%}
6462 
6463 // Note: Long.bitCount(long) returns an int.
// Population count of a long loaded from memory: POPCNT each 32-bit
// half (the high word lives at $mem+4 on little-endian x86) and add.
// Note: Long.bitCount(long) returns an int.
instruct popCountL_mem(eRegI dst, memory mem, eRegI tmp, eFlagsReg cr) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountL (LoadL mem)));
  effect(KILL cr, TEMP tmp, TEMP dst);

  format %{ "POPCNT $dst, $mem\n\t"
            "POPCNT $tmp, $mem+4\n\t"
            "ADD    $dst, $tmp" %}
  ins_encode %{
    // Addresses are rebuilt via make_raw so the +4 can be folded into
    // the displacement (there is no $mem$$Address-with-offset accessor):
    //__ popcntl($dst$$Register, $mem$$Address$$first);
    //__ popcntl($tmp$$Register, $mem$$Address$$second);
    __ popcntl($dst$$Register, Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, false));
    __ popcntl($tmp$$Register, Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, false));
    __ addl($dst$$Register, $tmp$$Register);
  %}
  ins_pipe(ialu_reg);
%}
6481 
6482 
6483 //----------Load/Store/Move Instructions---------------------------------------
6484 //----------Load Instructions--------------------------------------------------
6485 // Load Byte (8bit signed)
// Load a signed byte and sign-extend it to a 32-bit register.
instruct loadB(xRegI dst, memory mem) %{
  match(Set dst (LoadB mem));

  ins_cost(125);
  format %{ "MOVSX8 $dst,$mem\t# byte" %}

  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}
6498 
6499 // Load Byte (8bit signed) into Long Register
// Load a signed byte and sign-extend it into a long register pair.
instruct loadB2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadB mem)));
  effect(KILL cr);   // SAR clobbers the flags

  ins_cost(375);
  format %{ "MOVSX8 $dst.lo,$mem\t# byte -> long\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "SAR    $dst.hi,7" %}

  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
    __ movl(HIGH_FROM_LOW($dst$$Register), $dst$$Register); // This is always a different register.
    // A shift by 7 suffices (instead of 31) because after MOVSX the top
    // 25 bits of dst.lo are already all copies of the sign bit.
    __ sarl(HIGH_FROM_LOW($dst$$Register), 7); // 24+1 MSB are already signed extended.
  %}

  ins_pipe(ialu_reg_mem);
%}
6517 
6518 // Load Unsigned Byte (8bit UNsigned)
// Load an unsigned byte and zero-extend it to a 32-bit register.
instruct loadUB(xRegI dst, memory mem) %{
  match(Set dst (LoadUB mem));

  ins_cost(125);
  format %{ "MOVZX8 $dst,$mem\t# ubyte -> int" %}

  ins_encode %{
    __ movzbl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}
6531 
6532 // Load Unsigned Byte (8 bit UNsigned) into Long Register
// Load an unsigned byte into a long register pair: zero-extend the low
// word and clear the high word.
instruct loadUB2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadUB mem)));
  effect(KILL cr);   // XOR clobbers the flags

  ins_cost(250);
  format %{ "MOVZX8 $dst.lo,$mem\t# ubyte -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}

  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));   // dst.hi = 0
  %}

  ins_pipe(ialu_reg_mem);
%}
6549 
6550 // Load Unsigned Byte (8 bit UNsigned) with mask into Long Register
// Load an unsigned byte, AND it with an 8-bit immediate mask, and
// zero-extend the result into a long register pair.
instruct loadUB2L_immI8(eRegL dst, memory mem, immI8 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadUB mem) mask)));
  effect(KILL cr);   // XOR/AND clobber the flags

  format %{ "MOVZX8 $dst.lo,$mem\t# ubyte & 8-bit mask -> long\n\t"
            "XOR    $dst.hi,$dst.hi\n\t"
            "AND    $dst.lo,$mask" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));   // dst.hi = 0
    __ andl(Rdst, $mask$$constant);
  %}
  ins_pipe(ialu_reg_mem);
%}
6566 
6567 // Load Short (16bit signed)
// Load a signed short and sign-extend it to a 32-bit register.
instruct loadS(eRegI dst, memory mem) %{
  match(Set dst (LoadS mem));

  ins_cost(125);
  format %{ "MOVSX  $dst,$mem\t# short" %}

  ins_encode %{
    __ movswl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}
6580 
6581 // Load Short (16 bit signed) to Byte (8 bit signed)
// (short << 24) >> 24 keeps only the low byte, sign-extended -- so load
// just that byte with MOVSX8.  Relies on little-endian byte order: the
// short's low byte is at the same address.
instruct loadS2B(eRegI dst, memory mem, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI (LoadS mem) twentyfour) twentyfour));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# short -> byte" %}
  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}
6592 
6593 // Load Short (16bit signed) into Long Register
// Load a signed short and sign-extend it into a long register pair.
instruct loadS2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadS mem)));
  effect(KILL cr);   // SAR clobbers the flags

  ins_cost(375);
  format %{ "MOVSX  $dst.lo,$mem\t# short -> long\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "SAR    $dst.hi,15" %}

  ins_encode %{
    __ movswl($dst$$Register, $mem$$Address);
    __ movl(HIGH_FROM_LOW($dst$$Register), $dst$$Register); // This is always a different register.
    // A shift by 15 suffices (instead of 31) because after MOVSX the top
    // 17 bits of dst.lo are already all copies of the sign bit.
    __ sarl(HIGH_FROM_LOW($dst$$Register), 15); // 16+1 MSB are already signed extended.
  %}

  ins_pipe(ialu_reg_mem);
%}
6611 
// Load Unsigned Short/Char (16bit unsigned)
instruct loadUS(eRegI dst, memory mem) %{
  match(Set dst (LoadUS mem));

  ins_cost(125);
  format %{ "MOVZX  $dst,$mem\t# ushort/char -> int" %}

  ins_encode %{
    __ movzwl($dst$$Register, $mem$$Address);   // zero-extending 16->32 bit load
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) to Byte (8 bit signed)
// (x << 24) >> 24 sign-extends from bit 7 regardless of how the short was
// loaded, so a sign-extending byte load of the low byte suffices.
instruct loadUS2B(eRegI dst, memory mem, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI (LoadUS mem) twentyfour) twentyfour));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# ushort -> byte" %}
  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) into Long Register
// Zero-extend into the low word and clear the high word (XOR kills flags).
instruct loadUS2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadUS mem)));
  effect(KILL cr);

  ins_cost(250);
  format %{ "MOVZX  $dst.lo,$mem\t# ushort/char -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}

  ins_encode %{
    __ movzwl($dst$$Register, $mem$$Address);
    __ xorl(HIGH_FROM_LOW($dst$$Register), HIGH_FROM_LOW($dst$$Register));
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) with mask 0xFF into Long Register
// Masking a ushort with 0xFF keeps only the low byte, which on little-endian
// x86 sits at the same address, so a zero-extending byte load does the whole
// job and no explicit AND is needed.
instruct loadUS2L_immI_255(eRegL dst, memory mem, immI_255 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadUS mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX8 $dst.lo,$mem\t# ushort/char & 0xFF -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) with a 16-bit mask into Long Register
// General 16-bit mask: zero-extending word load, clear the high word, then
// apply the mask explicitly.
instruct loadUS2L_immI16(eRegL dst, memory mem, immI16 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadUS mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX  $dst.lo, $mem\t# ushort/char & 16-bit mask -> long\n\t"
            "XOR    $dst.hi,$dst.hi\n\t"
            "AND    $dst.lo,$mask" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzwl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
    __ andl(Rdst, $mask$$constant);
  %}
  ins_pipe(ialu_reg_mem);
%}
6686 
// Load Integer
instruct loadI(eRegI dst, memory mem) %{
  match(Set dst (LoadI mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem\t# int" %}

  ins_encode %{
    __ movl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Byte (8 bit signed)
// (x << 24) >> 24 is sign extension from bit 7; a sign-extending byte load
// of the low byte implements it directly.
instruct loadI2B(eRegI dst, memory mem, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI (LoadI mem) twentyfour) twentyfour));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# int -> byte" %}
  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Unsigned Byte (8 bit UNsigned)
// (LoadI & 0xFF) keeps only the low byte: use a zero-extending byte load.
instruct loadI2UB(eRegI dst, memory mem, immI_255 mask) %{
  match(Set dst (AndI (LoadI mem) mask));

  ins_cost(125);
  format %{ "MOVZX  $dst, $mem\t# int -> ubyte" %}
  ins_encode %{
    __ movzbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Short (16 bit signed)
// (x << 16) >> 16 is sign extension from bit 15: sign-extending word load.
instruct loadI2S(eRegI dst, memory mem, immI_16 sixteen) %{
  match(Set dst (RShiftI (LShiftI (LoadI mem) sixteen) sixteen));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# int -> short" %}
  ins_encode %{
    __ movswl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Unsigned Short/Char (16 bit UNsigned)
// (LoadI & 0xFFFF) keeps only the low word: zero-extending word load.
instruct loadI2US(eRegI dst, memory mem, immI_65535 mask) %{
  match(Set dst (AndI (LoadI mem) mask));

  ins_cost(125);
  format %{ "MOVZX  $dst, $mem\t# int -> ushort/char" %}
  ins_encode %{
    __ movzwl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}
6748 
// Load Integer into Long Register
// Sign-extend a 32-bit int to 64 bits: copy low word to high word, then
// arithmetic-shift the high word by 31 so it becomes all sign bits.
instruct loadI2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadI mem)));
  effect(KILL cr);

  ins_cost(375);
  format %{ "MOV    $dst.lo,$mem\t# int -> long\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "SAR    $dst.hi,31" %}

  ins_encode %{
    __ movl($dst$$Register, $mem$$Address);
    __ movl(HIGH_FROM_LOW($dst$$Register), $dst$$Register); // This is always a different register.
    __ sarl(HIGH_FROM_LOW($dst$$Register), 31);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Integer with mask 0xFF into Long Register
// The 0xFF mask reduces the load to the low byte (little-endian), so a
// zero-extending byte load plus clearing the high word is sufficient.
instruct loadI2L_immI_255(eRegL dst, memory mem, immI_255 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX8 $dst.lo,$mem\t# int & 0xFF -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer with mask 0xFFFF into Long Register
// The 0xFFFF mask reduces the load to the low word: zero-extending word
// load plus clearing the high word.
instruct loadI2L_immI_65535(eRegL dst, memory mem, immI_65535 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX  $dst.lo,$mem\t# int & 0xFFFF -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzwl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer with 32-bit mask into Long Register
// General case: full 32-bit load, clear the high word, AND with the mask.
instruct loadI2L_immI(eRegL dst, memory mem, immI mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
  effect(KILL cr);

  format %{ "MOV    $dst.lo,$mem\t# int & 32-bit mask -> long\n\t"
            "XOR    $dst.hi,$dst.hi\n\t"
            "AND    $dst.lo,$mask" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
    __ andl(Rdst, $mask$$constant);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Integer into Long Register
// Zero-extending int -> long: plain 32-bit load, clear the high word.
instruct loadUI2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (LoadUI2L mem));
  effect(KILL cr);

  ins_cost(250);
  format %{ "MOV    $dst.lo,$mem\t# uint -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}

  ins_encode %{
    __ movl($dst$$Register, $mem$$Address);
    __ xorl(HIGH_FROM_LOW($dst$$Register), HIGH_FROM_LOW($dst$$Register));
  %}

  ins_pipe(ialu_reg_mem);
%}
6831 
// Load Long.  Cannot clobber address while loading, so restrict address
// register to ESI
// Non-atomic 64-bit load as two 32-bit loads (mem and mem+4).  Only valid
// when the LoadL node does not require atomic access.
instruct loadL(eRegL dst, load_long_memory mem) %{
  predicate(!((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));

  ins_cost(250);
  format %{ "MOV    $dst.lo,$mem\t# long\n\t"
            "MOV    $dst.hi,$mem+4" %}

  ins_encode %{
    // Build the two word addresses by hand so the high half uses disp + 4.
    Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, false);
    Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, false);
    __ movl($dst$$Register, Amemlo);
    __ movl(HIGH_FROM_LOW($dst$$Register), Amemhi);
  %}

  ins_pipe(ialu_reg_long_mem);
%}

// Volatile Load Long.  Must be atomic, so do 64-bit FILD
// then store it down to the stack and reload on the int
// side.
// Used when SSE2 is unavailable; the x87 FILD/FISTP pair gives a single
// atomic 64-bit memory access.
instruct loadL_volatile(stackSlotL dst, memory mem) %{
  predicate(UseSSE<=1 && ((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));

  ins_cost(200);
  format %{ "FILD   $mem\t# Atomic volatile long load\n\t"
            "FISTp  $dst" %}
  ins_encode(enc_loadL_volatile(mem,dst));
  ins_pipe( fpu_reg_mem );
%}

// SSE2 variant: one atomic 64-bit MOVSD through a temporary XMM register,
// result delivered to a stack slot.
instruct loadLX_volatile(stackSlotL dst, memory mem, regXD tmp) %{
  predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));
  effect(TEMP tmp);
  ins_cost(180);
  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
            "MOVSD  $dst,$tmp" %}
  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
  ins_pipe( pipe_slow );
%}

// SSE2 variant delivering directly to an integer register pair: MOVD the
// low word out, shift the XMM temp right 32 bits, MOVD the high word out.
instruct loadLX_reg_volatile(eRegL dst, memory mem, regXD tmp) %{
  predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));
  effect(TEMP tmp);
  ins_cost(160);
  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
            "MOVD   $dst.lo,$tmp\n\t"
            "PSRLQ  $tmp,32\n\t"
            "MOVD   $dst.hi,$tmp" %}
  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
  ins_pipe( pipe_slow );
%}
6889 
// Load Range
// Array length load; plain 32-bit MOV (opcode 0x8B = MOV r32, r/m32).
instruct loadRange(eRegI dst, memory mem) %{
  match(Set dst (LoadRange mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}


// Load Pointer
// Pointers are 32 bits on this platform; same MOV encoding as loadI.
instruct loadP(eRegP dst, memory mem) %{
  match(Set dst (LoadP mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Klass Pointer
// Klass pointers are ordinary 32-bit pointers here (no compression).
instruct loadKlass(eRegP dst, memory mem) %{
  match(Set dst (LoadKlass mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}
6923 
// Load Double
// x87 path (no SSE2): FLD the 64-bit value onto the FP stack, then pop it
// into the allocated x87 register.
instruct loadD(regD dst, memory mem) %{
  predicate(UseSSE<=1);
  match(Set dst (LoadD mem));

  ins_cost(150);
  format %{ "FLD_D  ST,$mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDD);               /* DD /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load Double to XMM
// MOVSD (F2 0F 10) clears the upper half of the XMM register; selected
// when UseXmmLoadAndClearUpper says that is the faster choice.
instruct loadXD(regXD dst, memory mem) %{
  predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
  match(Set dst (LoadD mem));
  ins_cost(145);
  format %{ "MOVSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// MOVLPD (66 0F 12) variant: loads only the low 64 bits, leaving the upper
// half of the XMM register untouched.
instruct loadXD_partial(regXD dst, memory mem) %{
  predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
  match(Set dst (LoadD mem));
  ins_cost(145);
  format %{ "MOVLPD $dst,$mem" %}
  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Load to XMM register (single-precision floating point)
// MOVSS instruction
instruct loadX(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (LoadF mem));
  ins_cost(145);
  format %{ "MOVSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Load Float
// x87 path (no SSE): FLD 32-bit real, pop into the allocated register.
instruct loadF(regF dst, memory mem) %{
  predicate(UseSSE==0);
  match(Set dst (LoadF mem));

  ins_cost(150);
  format %{ "FLD_S  ST,$mem\n\t"
            "FSTP   $dst" %}
  opcode(0xD9);               /* D9 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_mem );
%}
6981 
// Load Aligned Packed Byte to XMM register
// All packed loads below are 64-bit MOVQ loads into the low half of an
// XMM register; only the vector element interpretation differs.
instruct loadA8B(regXD dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (Load8B mem));
  ins_cost(125);
  format %{ "MOVQ  $dst,$mem\t! packed8B" %}
  ins_encode( movq_ld(dst, mem));
  ins_pipe( pipe_slow );
%}

// Load Aligned Packed Short to XMM register
instruct loadA4S(regXD dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (Load4S mem));
  ins_cost(125);
  format %{ "MOVQ  $dst,$mem\t! packed4S" %}
  ins_encode( movq_ld(dst, mem));
  ins_pipe( pipe_slow );
%}

// Load Aligned Packed Char to XMM register
instruct loadA4C(regXD dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (Load4C mem));
  ins_cost(125);
  format %{ "MOVQ  $dst,$mem\t! packed4C" %}
  ins_encode( movq_ld(dst, mem));
  ins_pipe( pipe_slow );
%}

// Load Aligned Packed Integer to XMM register
instruct load2IU(regXD dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (Load2I mem));
  ins_cost(125);
  format %{ "MOVQ  $dst,$mem\t! packed2I" %}
  ins_encode( movq_ld(dst, mem));
  ins_pipe( pipe_slow );
%}

// Load Aligned Packed Single to XMM
instruct loadA2F(regXD dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (Load2F mem));
  ins_cost(145);
  format %{ "MOVQ  $dst,$mem\t! packed2F" %}
  ins_encode( movq_ld(dst, mem));
  ins_pipe( pipe_slow );
%}
7031 
// Load Effective Address
// LEA (opcode 0x8D) computes the address of the memory operand without
// touching memory or flags.  One variant per addressing mode.
instruct leaP8(eRegP dst, indOffset8 mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

// Base + 32-bit displacement.
instruct leaP32(eRegP dst, indOffset32 mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

// Base + index + displacement.
instruct leaPIdxOff(eRegP dst, indIndexOffset mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

// Base + scaled index.
instruct leaPIdxScale(eRegP dst, indIndexScale mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

// Base + scaled index + displacement.
instruct leaPIdxScaleOff(eRegP dst, indIndexScaleOffset mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}
7082 
// Load Constant
instruct loadConI(eRegI dst, immI src) %{
  match(Set dst src);

  format %{ "MOV    $dst,$src" %}
  ins_encode( LdImmI(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

// Load Constant zero
// XOR reg,reg is shorter than MOV reg,0 but clobbers the flags.
instruct loadConI0(eRegI dst, immI0 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);

  ins_cost(50);
  format %{ "XOR    $dst,$dst" %}
  opcode(0x33);  /* + rd */
  ins_encode( OpcP, RegReg( dst, dst ) );
  ins_pipe( ialu_reg );
%}

// Load pointer constant (includes oops; 0xB8+rd = MOV r32, imm32).
instruct loadConP(eRegP dst, immP src) %{
  match(Set dst src);

  format %{ "MOV    $dst,$src" %}
  opcode(0xB8);  /* + rd */
  ins_encode( LdImmP(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

// Load long constant as two 32-bit immediate moves.
instruct loadConL(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);
  ins_cost(200);
  format %{ "MOV    $dst.lo,$src.lo\n\t"
            "MOV    $dst.hi,$src.hi" %}
  opcode(0xB8);
  ins_encode( LdImmL_Lo(dst, src), LdImmL_Hi(dst, src) );
  ins_pipe( ialu_reg_long_fat );
%}

// Long zero: XOR both halves (cheaper than two immediate moves).
instruct loadConL0(eRegL dst, immL0 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);
  ins_cost(150);
  format %{ "XOR    $dst.lo,$dst.lo\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  opcode(0x33,0x33);
  ins_encode( RegReg_Lo(dst,dst), RegReg_Hi(dst, dst) );
  ins_pipe( ialu_reg_long );
%}
7134 
// The instruction usage is guarded by predicate in operand immF().
// x87 float constant: FLD from the constant table, pop into the register.
instruct loadConF(regF dst, immF src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_S  ST,$src\n\t"
            "FSTP   $dst" %}
  opcode(0xD9, 0x00);       /* D9 /0 */
  ins_encode(LdImmF(src), Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_con );
%}

// The instruction usage is guarded by predicate in operand immXF().
// SSE float constant: MOVSS (F3 0F 10) from the constant table.
instruct loadConX(regX dst, immXF con) %{
  match(Set dst con);
  ins_cost(125);
  format %{ "MOVSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), LdImmX(dst, con));
  ins_pipe( pipe_slow );
%}

// The instruction usage is guarded by predicate in operand immXF0().
// Float zero: XORPS reg,reg avoids a memory access.
instruct loadConX0(regX dst, immXF0 src) %{
  match(Set dst src);
  ins_cost(100);
  format %{ "XORPS  $dst,$dst\t# float 0.0" %}
  ins_encode( Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
  ins_pipe( pipe_slow );
%}

// The instruction usage is guarded by predicate in operand immD().
// x87 double constant: FLD from the constant table, pop into the register.
instruct loadConD(regD dst, immD src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_D  ST,$src\n\t"
            "FSTP   $dst" %}
  ins_encode(LdImmD(src), Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_con );
%}

// The instruction usage is guarded by predicate in operand immXD().
// SSE2 double constant: MOVSD from the constant table.
instruct loadConXD(regXD dst, immXD con) %{
  match(Set dst con);
  ins_cost(125);
  format %{ "MOVSD  $dst,[$con]" %}
  ins_encode(load_conXD(dst, con));
  ins_pipe( pipe_slow );
%}

// The instruction usage is guarded by predicate in operand immXD0().
// Double zero: XORPD reg,reg avoids a memory access.
instruct loadConXD0(regXD dst, immXD0 src) %{
  match(Set dst src);
  ins_cost(100);
  format %{ "XORPD  $dst,$dst\t# double 0.0" %}
  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x57), RegReg(dst,dst));
  ins_pipe( pipe_slow );
%}
7193 
// Load Stack Slot
// Reload spilled values from their stack slots (0x8B = MOV r32, r/m32).
instruct loadSSI(eRegI dst, stackSlotI src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "MOV    $dst,$src" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );
%}

// Reload a spilled long: two 32-bit loads, low then high word.
instruct loadSSL(eRegL dst, stackSlotL src) %{
  match(Set dst src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi" %}
  opcode(0x8B, 0x8B);
  ins_encode( OpcP, RegMem( dst, src ), OpcS, RegMem_Hi( dst, src ) );
  ins_pipe( ialu_mem_long_reg );
%}

// Load Stack Slot
instruct loadSSP(eRegP dst, stackSlotP src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "MOV    $dst,$src" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );
%}

// Load Stack Slot
// x87 reload of a spilled float: FLD m32real, pop into the register.
instruct loadSSF(regF dst, stackSlotF src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_S  $src\n\t"
            "FSTP   $dst" %}
  opcode(0xD9);               /* D9 /0, FLD m32real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load Stack Slot
// x87 reload of a spilled double: FLD m64real, pop into the register.
instruct loadSSD(regD dst, stackSlotD src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_D  $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDD);               /* DD /0, FLD m64real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_mem );
%}
7252 
// Prefetch instructions.
// Must be safe to execute with invalid address (cannot fault).
//
// Read prefetches: one variant per hint, selected by SSE/3DNow! support
// and the ReadPrefetchInstr flag.  SSE hints use 0F 18 with the hint in
// the /reg field (/0=NTA, /1=T0, /3=T2); the 3DNow! form is 0F 0D.

// No prefetch support at all: emit nothing (zero-size instruction).
instruct prefetchr0( memory mem ) %{
  predicate(UseSSE==0 && !VM_Version::supports_3dnow());
  match(PrefetchRead mem);
  ins_cost(0);
  size(0);
  format %{ "PREFETCHR (non-SSE is empty encoding)" %}
  ins_encode();
  ins_pipe(empty);
%}

// 3DNow! PREFETCH (also chosen explicitly via ReadPrefetchInstr==3).
instruct prefetchr( memory mem ) %{
  predicate(UseSSE==0 && VM_Version::supports_3dnow() || ReadPrefetchInstr==3);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %}
  opcode(0x0F, 0x0d);     /* Opcode 0F 0d /0 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
  ins_pipe(ialu_mem);
%}

instruct prefetchrNTA( memory mem ) %{
  predicate(UseSSE>=1 && ReadPrefetchInstr==0);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %}
  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
  ins_pipe(ialu_mem);
%}

instruct prefetchrT0( memory mem ) %{
  predicate(UseSSE>=1 && ReadPrefetchInstr==1);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %}
  opcode(0x0F, 0x18);     /* Opcode 0F 18 /1 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
  ins_pipe(ialu_mem);
%}

instruct prefetchrT2( memory mem ) %{
  predicate(UseSSE>=1 && ReadPrefetchInstr==2);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %}
  opcode(0x0F, 0x18);     /* Opcode 0F 18 /3 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
  ins_pipe(ialu_mem);
%}

// Write prefetches: same structure, selected by AllocatePrefetchInstr.

// No prefetch support: empty encoding.
instruct prefetchw0( memory mem ) %{
  predicate(UseSSE==0 && !VM_Version::supports_3dnow());
  match(PrefetchWrite mem);
  ins_cost(0);
  size(0);
  format %{ "Prefetch (non-SSE is empty encoding)" %}
  ins_encode();
  ins_pipe(empty);
%}

// 3DNow! PREFETCHW (0F 0D /1) marks the line modified-intent.
instruct prefetchw( memory mem ) %{
  predicate(UseSSE==0 && VM_Version::supports_3dnow() || AllocatePrefetchInstr==3);
  match( PrefetchWrite mem );
  ins_cost(100);

  format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %}
  opcode(0x0F, 0x0D);     /* Opcode 0F 0D /1 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
  ins_pipe(ialu_mem);
%}

instruct prefetchwNTA( memory mem ) %{
  predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
  match(PrefetchWrite mem);
  ins_cost(100);

  format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for write" %}
  opcode(0x0F, 0x18);     /* Opcode 0F 18 /0 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x00,mem));
  ins_pipe(ialu_mem);
%}

instruct prefetchwT0( memory mem ) %{
  predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
  match(PrefetchWrite mem);
  ins_cost(100);

  format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for write" %}
  opcode(0x0F, 0x18);     /* Opcode 0F 18 /1 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x01,mem));
  ins_pipe(ialu_mem);
%}

instruct prefetchwT2( memory mem ) %{
  predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
  match(PrefetchWrite mem);
  ins_cost(100);

  format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for write" %}
  opcode(0x0F, 0x18);     /* Opcode 0F 18 /3 */
  ins_encode(OpcP, OpcS, RMopc_Mem(0x03,mem));
  ins_pipe(ialu_mem);
%}
7363 
//----------Store Instructions-------------------------------------------------

// Store Byte
// 0x88 = MOV r/m8, r8; xRegI restricts src to a byte-addressable register.
instruct storeB(memory mem, xRegI src) %{
  match(Set mem (StoreB mem src));

  ins_cost(125);
  format %{ "MOV8   $mem,$src" %}
  opcode(0x88);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Char/Short
// 16-bit store: operand-size prefix 0x66 followed by MOV r/m, r (0x89).
instruct storeC(memory mem, eRegI src) %{
  match(Set mem (StoreC mem src));

  ins_cost(125);
  format %{ "MOV16  $mem,$src" %}
  opcode(0x89, 0x66);
  ins_encode( OpcS, OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Integer
instruct storeI(memory mem, eRegI src) %{
  match(Set mem (StoreI mem src));

  ins_cost(125);
  format %{ "MOV    $mem,$src" %}
  opcode(0x89);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Long
// Non-atomic 64-bit store as two 32-bit stores (mem and mem+4); only valid
// when the StoreL node does not require atomic access.
instruct storeL(long_memory mem, eRegL src) %{
  predicate(!((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));

  ins_cost(200);
  format %{ "MOV    $mem,$src.lo\n\t"
            "MOV    $mem+4,$src.hi" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, mem ), OpcS, RegMem_Hi( src, mem ) );
  ins_pipe( ialu_mem_long_reg );
%}

// Store Long to Integer
// ConvL2I just takes the low word, so store only $src.lo.
instruct storeL2I(memory mem, eRegL src) %{
  match(Set mem (StoreI mem (ConvL2I src)));

  format %{ "MOV    $mem,$src.lo\t# long -> int" %}
  ins_encode %{
    __ movl($mem$$Address, $src$$Register);   // eRegL's base register is the low word
  %}
  ins_pipe(ialu_mem_reg);
%}
7422 
// Volatile Store Long.  Must be atomic, so move it into
// the FP TOS and then do a 64-bit FIST.  Has to probe the
// target address before the store (for null-ptr checks)
// so the memory operand is used twice in the encoding.
// The leading CMP (0x3B) only reads $mem; it exists to take the implicit
// null-check fault before the FP store, which is why flags are killed.
instruct storeL_volatile(memory mem, stackSlotL src, eFlagsReg cr ) %{
  predicate(UseSSE<=1 && ((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));
  effect( KILL cr );
  ins_cost(400);
  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
            "FILD   $src\n\t"
            "FISTp  $mem\t # 64-bit atomic volatile long store" %}
  opcode(0x3B);
  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeL_volatile(mem,src));
  ins_pipe( fpu_reg_mem );
%}

// SSE2 variant: probe, then one atomic 64-bit MOVSD via a temp XMM register.
instruct storeLX_volatile(memory mem, stackSlotL src, regXD tmp, eFlagsReg cr) %{
  predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));
  effect( TEMP tmp, KILL cr );
  ins_cost(380);
  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
            "MOVSD  $tmp,$src\n\t"
            "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
  opcode(0x3B);
  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_volatile(mem, src, tmp));
  ins_pipe( pipe_slow );
%}

// SSE2 variant with the source already in an integer register pair:
// assemble the 64-bit value in an XMM register with MOVD + PUNPCKLDQ,
// then store it atomically with MOVSD.
instruct storeLX_reg_volatile(memory mem, eRegL src, regXD tmp2, regXD tmp, eFlagsReg cr) %{
  predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));
  effect( TEMP tmp2 , TEMP tmp, KILL cr );
  ins_cost(360);
  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
            "MOVD   $tmp,$src.lo\n\t"
            "MOVD   $tmp2,$src.hi\n\t"
            "PUNPCKLDQ $tmp,$tmp2\n\t"
            "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
  opcode(0x3B);
  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeLX_reg_volatile(mem, src, tmp, tmp2));
  ins_pipe( pipe_slow );
%}
7467 
// Store Pointer; for storing unknown oops and raw pointers
instruct storeP(memory mem, anyRegP src) %{
  match(Set mem (StoreP mem src));

  ins_cost(125);
  format %{ "MOV    $mem,$src" %}
  opcode(0x89);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Integer Immediate
// C7 /0 = MOV r/m32, imm32.
instruct storeImmI(memory mem, immI src) %{
  match(Set mem (StoreI mem src));

  ins_cost(150);
  format %{ "MOV    $mem,$src" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Short/Char Immediate
// 16-bit immediate store: operand-size prefix + C7 /0 with a 16-bit
// immediate.  Gated by UseStoreImmI16 (slow on some CPUs).
instruct storeImmI16(memory mem, immI16 src) %{
  predicate(UseStoreImmI16);
  match(Set mem (StoreC mem src));

  ins_cost(150);
  format %{ "MOV16  $mem,$src" %}
  opcode(0xC7);     /* C7 /0 Same as 32 store immediate with prefix */
  ins_encode( SizePrefix, OpcP, RMopc_Mem(0x00,mem),  Con16( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Pointer Immediate; null pointers or constant oops that do not
// need card-mark barriers.
instruct storeImmP(memory mem, immP src) %{
  match(Set mem (StoreP mem src));

  ins_cost(150);
  format %{ "MOV    $mem,$src" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Byte Immediate
// C6 /0 = MOV r/m8, imm8.
instruct storeImmB(memory mem, immI8 src) %{
  match(Set mem (StoreB mem src));

  ins_cost(150);
  format %{ "MOV8   $mem,$src" %}
  opcode(0xC6);               /* C6 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  ins_pipe( ialu_mem_imm );
%}
7524 
// Store Aligned Packed Byte XMM register to memory
// The packed stores below are 64-bit MOVQ stores of the low half of an
// XMM register; only the vector element interpretation differs.
instruct storeA8B(memory mem, regXD src) %{
  predicate(UseSSE>=1);
  match(Set mem (Store8B mem src));
  ins_cost(145);
  format %{ "MOVQ  $mem,$src\t! packed8B" %}
  ins_encode( movq_st(mem, src));
  ins_pipe( pipe_slow );
%}

// Store Aligned Packed Char/Short XMM register to memory
instruct storeA4C(memory mem, regXD src) %{
  predicate(UseSSE>=1);
  match(Set mem (Store4C mem src));
  ins_cost(145);
  format %{ "MOVQ  $mem,$src\t! packed4C" %}
  ins_encode( movq_st(mem, src));
  ins_pipe( pipe_slow );
%}

// Store Aligned Packed Integer XMM register to memory
instruct storeA2I(memory mem, regXD src) %{
  predicate(UseSSE>=1);
  match(Set mem (Store2I mem src));
  ins_cost(145);
  format %{ "MOVQ  $mem,$src\t! packed2I" %}
  ins_encode( movq_st(mem, src));
  ins_pipe( pipe_slow );
%}

// Store CMS card-mark Immediate
// Byte immediate store used for GC card marking (same encoding as storeImmB).
instruct storeImmCM(memory mem, immI8 src) %{
  match(Set mem (StoreCM mem src));

  ins_cost(150);
  format %{ "MOV8   $mem,$src\t! CMS card-mark imm0" %}
  opcode(0xC6);               /* C6 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  ins_pipe( ialu_mem_imm );
%}
7565 
// Store Double
// x87 path (no SSE2): FST m64 (DD /2) from the top-of-stack register.
// The regDPR1 operand constrains the value to ST(0), which FST requires.
instruct storeD( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreD mem src));

  ins_cost(100);
  format %{ "FST_D  $mem,$src" %}
  opcode(0xDD);       /* DD /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store double does rounding on x86
// The x87 store itself narrows the 80-bit internal value to 64 bits, so the
// RoundDouble node is folded into this store at no extra cost.
instruct storeD_rounded( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreD mem (RoundDouble src)));

  ins_cost(100);
  format %{ "FST_D  $mem,$src\t# round" %}
  opcode(0xDD);       /* DD /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}
7589 
// Store XMM register to memory (double-precision floating points)
// MOVSD instruction (F2 0F 11 /r).
instruct storeXD(memory mem, regXD src) %{
  predicate(UseSSE>=2);
  match(Set mem (StoreD mem src));
  ins_cost(95);
  format %{ "MOVSD  $mem,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
  ins_pipe( pipe_slow );
%}

// Store XMM register to memory (single-precision floating point)
// MOVSS instruction (F3 0F 11 /r).
instruct storeX(memory mem, regX src) %{
  predicate(UseSSE>=1);
  match(Set mem (StoreF mem src));
  ins_cost(95);
  format %{ "MOVSS  $mem,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, mem));
  ins_pipe( pipe_slow );
%}
7611 
// Store Aligned Packed Single Float XMM register to memory
// Moves the low 64 bits (two packed floats) of an XMM register to memory.
instruct storeA2F(memory mem, regXD src) %{
  predicate(UseSSE>=1);
  match(Set mem (Store2F mem src));
  ins_cost(145);
  format %{ "MOVQ  $mem,$src\t! packed2F" %}
  ins_encode( movq_st(mem, src));
  ins_pipe( pipe_slow );
%}
7621 
// Store Float
// x87 path (no SSE): FST m32 (D9 /2) from ST(0).
instruct storeF( memory mem, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set mem (StoreF mem src));

  ins_cost(100);
  format %{ "FST_S  $mem,$src" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store Float does rounding on x86
// The 32-bit x87 store narrows the value itself, so RoundFloat folds into
// the store for free.
instruct storeF_rounded( memory mem, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set mem (StoreF mem (RoundFloat src)));

  ins_cost(100);
  format %{ "FST_S  $mem,$src\t# round" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store Float from a Double (ConvD2F): the 32-bit x87 store performs the
// double-to-float narrowing, so the conversion folds into the store.
instruct storeF_Drounded( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreF mem (ConvD2F src)));

  ins_cost(100);
  format %{ "FST_S  $mem,$src\t# D-round" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FP_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}
7657 
// Store immediate Float value (it is faster than store from FPU register)
// The instruction usage is guarded by predicate in operand immF().
// Encodes MOV r/m32, imm32 (C7 /0) with the raw IEEE-754 bit pattern of
// the float constant as the 32-bit immediate.
instruct storeF_imm( memory mem, immF src) %{
  match(Set mem (StoreF mem src));

  ins_cost(50);
  format %{ "MOV    $mem,$src\t# store float" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32F_as_bits( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store immediate Float value (it is faster than store from XMM register)
// The instruction usage is guarded by predicate in operand immXF().
instruct storeX_imm( memory mem, immXF src) %{
  match(Set mem (StoreF mem src));

  ins_cost(50);
  format %{ "MOV    $mem,$src\t# store float" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32XF_as_bits( src ));
  ins_pipe( ialu_mem_imm );
%}
7681 
// Store Integer to stack slot
instruct storeSSI(stackSlotI dst, eRegI src) %{
  match(Set dst src);

  ins_cost(100);
  format %{ "MOV    $dst,$src" %}
  opcode(0x89);
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Pointer to stack slot
instruct storeSSP(stackSlotP dst, eRegP src) %{
  match(Set dst src);

  ins_cost(100);
  format %{ "MOV    $dst,$src" %}
  opcode(0x89);
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Long to stack slot
// Two 32-bit MOVs: low word at $dst, high word at $dst+4.
instruct storeSSL(stackSlotL dst, eRegL src) %{
  match(Set dst src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
  ins_pipe( ialu_mem_long_reg );
%}
7715 
//----------MemBar Instructions-----------------------------------------------
// Memory barrier flavors
// On x86 acquire and release barriers need no machine instruction (they
// emit zero bytes); only the volatile (StoreLoad) barrier emits code.

instruct membar_acquire() %{
  match(MemBarAcquire);
  ins_cost(400);

  size(0);
  format %{ "MEMBAR-acquire ! (empty encoding)" %}
  ins_encode();
  ins_pipe(empty);
%}

// Acquire barrier immediately after a FastLock: the lock's CMPXCHG already
// provides the ordering, so this version costs nothing.
instruct membar_acquire_lock() %{
  match(MemBarAcquire);
  predicate(Matcher::prior_fast_lock(n));
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-acquire (prior CMPXCHG in FastLock so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_release() %{
  match(MemBarRelease);
  ins_cost(400);

  size(0);
  format %{ "MEMBAR-release ! (empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

// Release barrier immediately before a FastUnlock: subsumed by the unlock.
instruct membar_release_lock() %{
  match(MemBarRelease);
  predicate(Matcher::post_fast_unlock(n));
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-release (a FastUnlock follows so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

// Full StoreLoad barrier. On MP systems this is a locked ADD to the top of
// the stack (cheaper than MFENCE on many CPUs); it clobbers EFLAGS, hence
// the KILL cr effect.
instruct membar_volatile(eFlagsReg cr) %{
  match(MemBarVolatile);
  effect(KILL cr);
  ins_cost(400);

  format %{ 
    $$template
    if (os::is_MP()) {
      $$emit$$"LOCK ADDL [ESP + #0], 0\t! membar_volatile"
    } else {
      $$emit$$"MEMBAR-volatile ! (empty encoding)"
    }
  %}
  ins_encode %{
    __ membar(Assembler::StoreLoad);
  %}
  ins_pipe(pipe_slow);
%}

// Volatile barrier proven redundant by a preceding store-load barrier.
instruct unnecessary_membar_volatile() %{
  match(MemBarVolatile);
  predicate(Matcher::post_store_load_barrier(n));
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-volatile (unnecessary so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}
7790 
//----------Move Instructions--------------------------------------------------
// CastX2P: reinterpret an int as a pointer. Both operands are pinned to EAX
// so no move is needed and the encoding is empty.
instruct castX2P(eAXRegP dst, eAXRegI src) %{
  match(Set dst (CastX2P src));
  format %{ "# X2P  $dst, $src" %}
  ins_encode( /*empty encoding*/ );
  ins_cost(0);
  ins_pipe(empty);
%}

// CastP2X: reinterpret a pointer as an int via a plain register copy.
instruct castP2X(eRegI dst, eRegP src ) %{
  match(Set dst (CastP2X src));
  ins_cost(50);
  format %{ "MOV    $dst, $src\t# CastP2X" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}
7807 
//----------Conditional Move---------------------------------------------------
// Conditional move
// CMOVcc (0F 40+cc) is only available on P6 and later, hence the
// supports_cmov() predicate on each form below.
instruct cmovI_reg(eRegI dst, eRegI src, eFlagsReg cr, cmpOp cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Unsigned-compare variant (uses unsigned condition codes).
instruct cmovI_regU( cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Carry-flag-only unsigned variant; delegates to cmovI_regU via expand.
instruct cmovI_regUCF( cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, eRegI src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    cmovI_regU(cop, cr, dst, src);
  %}
%}

// Conditional move with a memory source (folds the load into the CMOV).
instruct cmovI_mem(cmpOp cop, eFlagsReg cr, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Conditional move
instruct cmovI_memU(cmpOpU cop, eFlagsRegU cr, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

instruct cmovI_memUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  expand %{
    cmovI_memU(cop, cr, dst, src);
  %}
%}
7869 
// Conditional move (pointer flavor of CMOVcc)
instruct cmovP_reg(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Conditional move (non-P6 version)
// Note:  a CMoveP is generated for  stubs and native wrappers
//        regardless of whether we are on a P6, so we
//        emulate a cmov here
// Emulation: a short branch over a plain MOV (8B /r).
instruct cmovP_reg_nonP6(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(300);
  format %{ "Jn$cop   skip\n\t"
          "MOV    $dst,$src\t# pointer\n"
      "skip:" %}
  opcode(0x8b);
  ins_encode( enc_cmov_branch(cop, 0x2), OpcP, RegReg(dst, src));
  ins_pipe( pipe_cmov_reg );
%}

// Conditional move, unsigned-compare variant
instruct cmovP_regU(cmpOpU cop, eFlagsRegU cr, eRegP dst, eRegP src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovP_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegP dst, eRegP src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    cmovP_regU(cop, cr, dst, src);
  %}
%}
7915 
7916 // DISABLED: Requires the ADLC to emit a bottom_type call that
7917 // correctly meets the two pointer arguments; one is an incoming
7918 // register but the other is a memory operand.  ALSO appears to
7919 // be buggy with implicit null checks.
7920 //
7921 //// Conditional move
7922 //instruct cmovP_mem(cmpOp cop, eFlagsReg cr, eRegP dst, memory src) %{
7923 //  predicate(VM_Version::supports_cmov() );
7924 //  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
7925 //  ins_cost(250);
7926 //  format %{ "CMOV$cop $dst,$src\t# ptr" %}
7927 //  opcode(0x0F,0x40);
7928 //  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
7929 //  ins_pipe( pipe_cmov_mem );
7930 //%}
7931 //
7932 //// Conditional move
7933 //instruct cmovP_memU(cmpOpU cop, eFlagsRegU cr, eRegP dst, memory src) %{
7934 //  predicate(VM_Version::supports_cmov() );
7935 //  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
7936 //  ins_cost(250);
7937 //  format %{ "CMOV$cop $dst,$src\t# ptr" %}
7938 //  opcode(0x0F,0x40);
7939 //  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
7940 //  ins_pipe( pipe_cmov_mem );
7941 //%}
7942 
// Conditional move
// x87 FCMOVcc (DA-prefixed); operates on full FPU stack registers, so the
// same enc_cmov_d encoding class serves both the double and float forms.
instruct fcmovD_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "FCMOV$cop $dst,$src\t# double" %}
  opcode(0xDA);
  ins_encode( enc_cmov_d(cop,src) );
  ins_pipe( pipe_cmovD_reg );
%}

// Conditional move
instruct fcmovF_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "FCMOV$cop $dst,$src\t# float" %}
  opcode(0xDA);
  ins_encode( enc_cmov_d(cop,src) );
  ins_pipe( pipe_cmovD_reg );
%}

// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
// Signed case is emulated: branch around an FPU register-to-register copy
// (FLD src; FSTP dst via DD D8+i).
instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOV    $dst,$src\t# double\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_D(src), OpcP, RegOpc(dst) );
  ins_pipe( pipe_cmovD_reg );
%}

// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop    skip\n\t"
            "MOV    $dst,$src\t# float\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_F(src), OpcP, RegOpc(dst) );
  ins_pipe( pipe_cmovD_reg );
%}
7990 
// No CMOVE with SSE/SSE2
// SSE has no conditional move, so emulate with a short branch over a MOVSS.
// The branch condition is the inverse of the CMOV condition.
instruct fcmovX_regS(cmpOp cop, eFlagsReg cr, regX dst, regX src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSS  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode %{
    Label skip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
    __ bind(skip);
  %}
  ins_pipe( pipe_slow );
%}
8008 
8009 // No CMOVE with SSE/SSE2
8010 instruct fcmovXD_regS(cmpOp cop, eFlagsReg cr, regXD dst, regXD src) %{
8011   predicate (UseSSE>=2);
8012   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
8013   ins_cost(200);
8014   format %{ "Jn$cop   skip\n\t"
8015             "MOVSD  $dst,$src\t# float\n"
8016       "skip:" %}
8017   ins_encode %{
8018     Label skip;
8019     // Invert sense of branch from sense of CMOV
8020     __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
8021     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8022     __ bind(skip);
8023   %}
8024   ins_pipe( pipe_slow );
8025 %}
8026 
// unsigned version
// Same branch-over-MOVSS emulation as fcmovX_regS, but for unsigned
// condition codes.
instruct fcmovX_regU(cmpOpU cop, eFlagsRegU cr, regX dst, regX src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSS  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode %{
    Label skip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
    __ bind(skip);
  %}
  ins_pipe( pipe_slow );
%}

// Carry-flag-only unsigned variant; delegates to fcmovX_regU via expand.
instruct fcmovX_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regX dst, regX src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovX_regU(cop, cr, dst, src);
  %}
%}
8053 
8054 // unsigned version
8055 instruct fcmovXD_regU(cmpOpU cop, eFlagsRegU cr, regXD dst, regXD src) %{
8056   predicate (UseSSE>=2);
8057   match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
8058   ins_cost(200);
8059   format %{ "Jn$cop   skip\n\t"
8060             "MOVSD  $dst,$src\t# float\n"
8061       "skip:" %}
8062   ins_encode %{
8063     Label skip;
8064     // Invert sense of branch from sense of CMOV
8065     __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
8066     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8067     __ bind(skip);
8068   %}
8069   ins_pipe( pipe_slow );
8070 %}
8071 
// Carry-flag-only unsigned variant; delegates to fcmovXD_regU via expand.
instruct fcmovXD_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regXD dst, regXD src) %{
  predicate (UseSSE>=2);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovXD_regU(cop, cr, dst, src);
  %}
%}
8080 
// Long conditional move: two CMOVs, one for each 32-bit half.
instruct cmovL_reg(cmpOp cop, eFlagsReg cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
            "CMOV$cop $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}

// Unsigned-compare variant.
instruct cmovL_regU(cmpOpU cop, eFlagsRegU cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
            "CMOV$cop $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}

// Carry-flag-only unsigned variant; delegates to cmovL_regU via expand.
instruct cmovL_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    cmovL_regU(cop, cr, dst, src);
  %}
%}
8111 
//----------Arithmetic Instructions--------------------------------------------
//----------Addition Instructions----------------------------------------------
// Integer Addition Instructions
// ADD r32, r/m32 (03 /r); clobbers EFLAGS, hence KILL cr.
instruct addI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(2);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Add immediate; OpcSErm/Con8or32 pick the short (imm8 sign-extended) or
// long (imm32) encoding depending on the constant.
instruct addI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  format %{ "ADD    $dst,$src" %}
  opcode(0x81, 0x00); /* /0 id */
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Add +1 via the one-byte INC (40+reg); guarded by UseIncDec since INC only
// partially updates flags on some CPUs.
instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
  predicate(UseIncDec);
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(1);
  format %{ "INC    $dst" %}
  opcode(0x40); /*  */
  ins_encode( Opc_plus( primary, dst ) );
  ins_pipe( ialu_reg );
%}

// Three-operand add via LEA; does not touch EFLAGS (no KILL cr).
instruct leaI_eReg_immI(eRegI dst, eRegI src0, immI src1) %{
  match(Set dst (AddI src0 src1));
  ins_cost(110);

  format %{ "LEA    $dst,[$src0 + $src1]" %}
  opcode(0x8D); /* 0x8D /r */
  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  ins_pipe( ialu_reg_reg );
%}

// Pointer flavor of the LEA add above.
instruct leaP_eReg_immI(eRegP dst, eRegP src0, immI src1) %{
  match(Set dst (AddP src0 src1));
  ins_cost(110);

  format %{ "LEA    $dst,[$src0 + $src1]\t# ptr" %}
  opcode(0x8D); /* 0x8D /r */
  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  ins_pipe( ialu_reg_reg );
%}

// Add -1 via the one-byte DEC (48+reg); guarded by UseIncDec.
instruct decI_eReg(eRegI dst, immI_M1 src, eFlagsReg cr) %{
  predicate(UseIncDec);
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(1);
  format %{ "DEC    $dst" %}
  opcode(0x48); /*  */
  ins_encode( Opc_plus( primary, dst ) );
  ins_pipe( ialu_reg );
%}
8179 
// Pointer add: same ADD encoding as the integer form.
instruct addP_eReg(eRegP dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (AddP dst src));
  effect(KILL cr);

  size(2);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Pointer add with immediate.
instruct addP_eReg_imm(eRegP dst, immI src, eFlagsReg cr) %{
  match(Set dst (AddP dst src));
  effect(KILL cr);

  format %{ "ADD    $dst,$src" %}
  opcode(0x81,0x00); /* Opcode 81 /0 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}
8201 
// Add a memory operand into a register (ADD r32, r/m32).
instruct addI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (AddI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Read-modify-write add: ADD r/m32, r32 (01 /r) matching the
// load-add-store ideal subtree.
instruct addI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "ADD    $dst,$src" %}
  opcode(0x01);  /* Opcode 01 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Add Memory with Immediate
instruct addI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "ADD    $dst,$src" %}
  opcode(0x81);               /* Opcode 81 /0 id */
  ins_encode( OpcSE( src ), RMopc_Mem(0x00,dst), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

// Increment a memory word in place (FF /0).
instruct incI_mem(memory dst, immI1 src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "INC    $dst" %}
  opcode(0xFF);               /* Opcode FF /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,dst));
  ins_pipe( ialu_mem_imm );
%}

// Decrement a memory word in place (FF /1).
instruct decI_mem(memory dst, immI_M1 src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "DEC    $dst" %}
  opcode(0xFF);               /* Opcode FF /1 */
  ins_encode( OpcP, RMopc_Mem(0x01,dst));
  ins_pipe( ialu_mem_imm );
%}
8257 
8258 
8259 instruct checkCastPP( eRegP dst ) %{
8260   match(Set dst (CheckCastPP dst));
8261 
8262   size(0);
8263   format %{ "#checkcastPP of $dst" %}
8264   ins_encode( /*empty encoding*/ );
8265   ins_pipe( empty );
8266 %}
8267 
8268 instruct castPP( eRegP dst ) %{
8269   match(Set dst (CastPP dst));
8270   format %{ "#castPP of $dst" %}
8271   ins_encode( /*empty encoding*/ );
8272   ins_pipe( empty );
8273 %}
8274 
8275 instruct castII( eRegI dst ) %{
8276   match(Set dst (CastII dst));
8277   format %{ "#castII of $dst" %}
8278   ins_encode( /*empty encoding*/ );
8279   ins_cost(0);
8280   ins_pipe( empty );
8281 %}
8282 
8283 
// Load-locked - same as a regular pointer load when used with compare-swap
instruct loadPLocked(eRegP dst, memory mem) %{
  match(Set dst (LoadPLocked mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem\t# Load ptr. locked" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// LoadLong-locked - same as a volatile long load when used with compare-swap
// x87 path: FILD/FISTP give a single atomic 64-bit memory access.
instruct loadLLocked(stackSlotL dst, load_long_memory mem) %{
  predicate(UseSSE<=1);
  match(Set dst (LoadLLocked mem));

  ins_cost(200);
  format %{ "FILD   $mem\t# Atomic volatile long load\n\t"
            "FISTp  $dst" %}
  ins_encode(enc_loadL_volatile(mem,dst));
  ins_pipe( fpu_reg_mem );
%}

// SSE2 path: atomic 64-bit load through a temporary XMM register, result
// left in a stack slot.
instruct loadLX_Locked(stackSlotL dst, load_long_memory mem, regXD tmp) %{
  predicate(UseSSE>=2);
  match(Set dst (LoadLLocked mem));
  effect(TEMP tmp);
  ins_cost(180);
  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
            "MOVSD  $dst,$tmp" %}
  ins_encode(enc_loadLX_volatile(mem, dst, tmp));
  ins_pipe( pipe_slow );
%}

// SSE2 path delivering the result directly into a long register pair:
// low half via MOVD, high half via shift-right-64-by-32 then MOVD.
instruct loadLX_reg_Locked(eRegL dst, load_long_memory mem, regXD tmp) %{
  predicate(UseSSE>=2);
  match(Set dst (LoadLLocked mem));
  effect(TEMP tmp);
  ins_cost(160);
  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
            "MOVD   $dst.lo,$tmp\n\t"
            "PSRLQ  $tmp,32\n\t"
            "MOVD   $dst.hi,$tmp" %}
  ins_encode(enc_loadLX_reg_volatile(mem, dst, tmp));
  ins_pipe( pipe_slow );
%}
8330 
// Conditional-store of the updated heap-top.
// Used during allocation of the shared heap.
// Sets flags (EQ) on success.  Implemented with a CMPXCHG on Intel.
// oldval is pinned to EAX because CMPXCHG implicitly compares against EAX.
instruct storePConditional( memory heap_top_ptr, eAXRegP oldval, eRegP newval, eFlagsReg cr ) %{
  match(Set cr (StorePConditional heap_top_ptr (Binary oldval newval)));
  // EAX is killed if there is contention, but then it's also unused.
  // In the common case of no contention, EAX holds the new oop address.
  format %{ "CMPXCHG $heap_top_ptr,$newval\t# If EAX==$heap_top_ptr Then store $newval into $heap_top_ptr" %}
  ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval,heap_top_ptr) );
  ins_pipe( pipe_cmpxchg );
%}

// Conditional-store of an int value.
// ZF flag is set on success, reset otherwise.  Implemented with a CMPXCHG on Intel.
instruct storeIConditional( memory mem, eAXRegI oldval, eRegI newval, eFlagsReg cr ) %{
  match(Set cr (StoreIConditional mem (Binary oldval newval)));
  effect(KILL oldval);
  format %{ "CMPXCHG $mem,$newval\t# If EAX==$mem Then store $newval into $mem" %}
  ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval, mem) );
  ins_pipe( pipe_cmpxchg );
%}

// Conditional-store of a long value.
// ZF flag is set on success, reset otherwise.  Implemented with a CMPXCHG8 on Intel.
// CMPXCHG8B implicitly uses EDX:EAX (compare) and ECX:EBX (new value).
instruct storeLConditional( memory mem, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  match(Set cr (StoreLConditional mem (Binary oldval newval)));
  effect(KILL oldval);
  format %{ "XCHG   EBX,ECX\t# correct order for CMPXCHG8 instruction\n\t"
            "CMPXCHG8 $mem,ECX:EBX\t# If EDX:EAX==$mem Then store ECX:EBX into $mem\n\t"
            "XCHG   EBX,ECX"
  %}
  ins_encode %{
    // Note: we need to swap rbx, and rcx before and after the
    //       cmpxchg8 instruction because the instruction uses
    //       rcx as the high order word of the new value to store but
    //       our register encoding uses rbx.
    __ xchgl(as_Register(EBX_enc), as_Register(ECX_enc));
    if( os::is_MP() )
      __ lock();
    __ cmpxchg8($mem$$Address);
    __ xchgl(as_Register(EBX_enc), as_Register(ECX_enc));
  %}
  ins_pipe( pipe_cmpxchg );
%}
8375 
// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them

// 64-bit CAS via LOCK CMPXCHG8B; EDX:EAX holds the expected value,
// ECX:EBX the new value, and the boolean result is materialized from ZF.
instruct compareAndSwapL( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg8(mem_ptr),
              enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

// Pointer CAS via LOCK CMPXCHG; EAX holds the expected value.
instruct compareAndSwapP( eRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
  match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

// Int CAS via LOCK CMPXCHG; EAX holds the expected value.
instruct compareAndSwapI( eRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
  match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}
8414 
//----------Subtraction Instructions-------------------------------------------
// Integer Subtraction Instructions
// SUB r32, r/m32 (2B /r); clobbers EFLAGS.
instruct subI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (SubI dst src));
  effect(KILL cr);

  size(2);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Subtract immediate (81 /5, with short-form selection via OpcSErm).
instruct subI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (SubI dst src));
  effect(KILL cr);

  format %{ "SUB    $dst,$src" %}
  opcode(0x81,0x05);  /* Opcode 81 /5 */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Subtract a memory operand from a register.
instruct subI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (SubI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Read-modify-write subtract: SUB r/m32, r32 (29 /r).
instruct subI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (SubI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "SUB    $dst,$src" %}
  opcode(0x29);  /* Opcode 29 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Subtract from a pointer
// Matches ptr + (0 - src), i.e. a pointer decrement, as a single SUB.
instruct subP_eReg(eRegP dst, eRegI src, immI0 zero, eFlagsReg cr) %{
  match(Set dst (AddP dst (SubI zero src)));
  effect(KILL cr);

  size(2);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Negate: matches 0 - dst as NEG (F7 /3).
instruct negI_eReg(eRegI dst, immI0 zero, eFlagsReg cr) %{
  match(Set dst (SubI zero dst));
  effect(KILL cr);

  size(2);
  format %{ "NEG    $dst" %}
  opcode(0xF7,0x03);  // Opcode F7 /3
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}
8483 
8484 
//----------Multiplication/Division Instructions-------------------------------
// Integer Multiplication Instructions
// Multiply Register
// Two-byte IMUL r32,r/m32: OpcS emits the 0x0F escape byte, OpcP the 0xAF.
instruct mulI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (MulI dst src));
  effect(KILL cr);

  size(3);
  ins_cost(300);
  format %{ "IMUL   $dst,$src" %}
  opcode(0xAF, 0x0F);
  ins_encode( OpcS, OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Multiply 32-bit Immediate
// Three-operand form: dst = src * imm (dst need not equal src).
instruct mulI_eReg_imm(eRegI dst, eRegI src, immI imm, eFlagsReg cr) %{
  match(Set dst (MulI src imm));
  effect(KILL cr);

  ins_cost(300);
  format %{ "IMUL   $dst,$src,$imm" %}
  opcode(0x69);  /* 69 /r id */
  ins_encode( OpcSE(imm), RegReg( dst, src ), Con8or32( imm ) );
  ins_pipe( ialu_reg_reg_alu0 );
%}
8511 
// Load a 32-bit long constant into the low word (EAX) of the EDX:EAX pair.
// Exists only to feed the multiply-high rules below.
instruct loadConL_low_only(eADXRegL_low_only dst, immL32 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);

  // Note that this is artificially increased to make it more expensive than loadConL
  ins_cost(250);
  format %{ "MOV    EAX,$src\t// low word only" %}
  opcode(0xB8);
  ins_encode( LdImmL_Lo(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

// Multiply by 32-bit Immediate, taking the shifted high order results
//  (special case for shift by 32)
instruct mulI_imm_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32 cnt, eFlagsReg cr) %{
  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  // Only applies when the MulL's right input is a long constant whose value
  // fits in a 32-bit int (so the loadConL_low_only trick is valid).
  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() >= min_jint &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() <= max_jint );
  effect(USE src1, KILL cr);

  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  ins_cost(0*100 + 1*400 - 150);
  format %{ "IMUL   EDX:EAX,$src1" %}
  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  ins_pipe( pipe_slow );
%}

// Multiply by 32-bit Immediate, taking the shifted high order results
// General case: shift count in [32,63], so an extra SAR of EDX is needed.
instruct mulI_imm_RShift_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  // Same constant-fits-in-int restriction as mulI_imm_high above.
  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() >= min_jint &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() <= max_jint );
  effect(USE src1, KILL cr);

  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  ins_cost(1*100 + 1*400 - 150);
  format %{ "IMUL   EDX:EAX,$src1\n\t"
            "SAR    EDX,$cnt-32" %}
  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  ins_pipe( pipe_slow );
%}
8555 
// Multiply Memory 32-bit Immediate: dst = [src] * imm (three-operand IMUL).
instruct mulI_mem_imm(eRegI dst, memory src, immI imm, eFlagsReg cr) %{
  match(Set dst (MulI (LoadI src) imm));
  effect(KILL cr);

  ins_cost(300);
  format %{ "IMUL   $dst,$src,$imm" %}
  opcode(0x69);  /* 69 /r id */
  ins_encode( OpcSE(imm), RegMem( dst, src ), Con8or32( imm ) );
  ins_pipe( ialu_reg_mem_alu0 );
%}

// Multiply Memory: dst = dst * [src] (0F AF two-byte IMUL).
instruct mulI(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (MulI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(350);
  format %{ "IMUL   $dst,$src" %}
  opcode(0xAF, 0x0F);
  ins_encode( OpcS, OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem_alu0 );
%}
8579 
// Multiply Register Int to Long
// Signed widening multiply; result lands in the EDX:EAX pair (eADXRegL).
instruct mulI2L(eADXRegL dst, eAXRegI src, nadxRegI src1, eFlagsReg flags) %{
  // Basic Idea: long = (long)int * (long)int
  match(Set dst (MulL (ConvI2L src) (ConvI2L src1)));
  effect(DEF dst, USE src, USE src1, KILL flags);

  ins_cost(300);
  format %{ "IMUL   $dst,$src1" %}

  ins_encode( long_int_multiply( dst, src1 ) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Unsigned widening multiply: inputs zero-extended via the 0xffffffff mask.
instruct mulIS_eReg(eADXRegL dst, immL_32bits mask, eFlagsReg flags, eAXRegI src, nadxRegI src1) %{
  // Basic Idea:  long = (int & 0xffffffffL) * (int & 0xffffffffL)
  match(Set dst (MulL (AndL (ConvI2L src) mask) (AndL (ConvI2L src1) mask)));
  effect(KILL flags);

  ins_cost(300);
  format %{ "MUL    $dst,$src1" %}

  ins_encode( long_uint_multiply(dst, src1) );
  ins_pipe( ialu_reg_reg_alu0 );
%}
8604 
// Multiply Register Long
// Full 64x64->64 multiply composed from three 32-bit multiplies plus adds.
instruct mulL_eReg(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(4*100+3*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  format %{ "MOV    $tmp,$src.lo\n\t"
            "IMUL   $tmp,EDX\n\t"
            "MOV    EDX,$src.hi\n\t"
            "IMUL   EDX,EAX\n\t"
            "ADD    $tmp,EDX\n\t"
            "MUL    EDX:EAX,$src.lo\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode( long_multiply( dst, src, tmp ) );
  ins_pipe( pipe_slow );
%}

// Multiply Register Long where the left operand's high 32 bits are zero
// Saves one IMUL relative to mulL_eReg (the x_hi*y_lo partial product is 0).
instruct mulL_eReg_lhi0(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
  predicate(is_mulL_operand_hi32_zero((MulLNode*)n, true));
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(2*100+2*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) + lo(x_lo * y_hi) where lo(x_hi * y_lo) = 0 because x_hi = 0
  format %{ "MOV    $tmp,$src.hi\n\t"
            "IMUL   $tmp,EAX\n\t"
            "MUL    EDX:EAX,$src.lo\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode %{
    __ movl($tmp$$Register, HIGH_FROM_LOW($src$$Register));
    __ imull($tmp$$Register, rax);
    __ mull($src$$Register);   // unsigned EAX * src.lo -> EDX:EAX
    __ addl(rdx, $tmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}
8643 
// Multiply Register Long where the right operand's high 32 bits are zero
// Mirror of mulL_eReg_lhi0: the x_lo*y_hi partial product is 0.
instruct mulL_eReg_rhi0(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
  predicate(is_mulL_operand_hi32_zero((MulLNode*)n, false));
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(2*100+2*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) where lo(x_lo * y_hi) = 0 because y_hi = 0
  format %{ "MOV    $tmp,$src.lo\n\t"
            "IMUL   $tmp,EDX\n\t"
            "MUL    EDX:EAX,$src.lo\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode %{
    __ movl($tmp$$Register, $src$$Register);
    __ imull($tmp$$Register, rdx);
    __ mull($src$$Register);   // unsigned EAX * src.lo -> EDX:EAX
    __ addl(rdx, $tmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Multiply Register Long where the left and the right operands' high 32 bits are zero
// Degenerates to a single unsigned 32x32->64 MUL.
instruct mulL_eReg_hi0(eADXRegL dst, eRegL src, eFlagsReg cr) %{
  predicate(is_mulL_operand_hi32_zero((MulLNode*)n, true) && is_mulL_operand_hi32_zero((MulLNode*)n, false));
  match(Set dst (MulL dst src));
  effect(KILL cr);
  ins_cost(1*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) where lo(x_hi * y_lo) = 0 and lo(x_lo * y_hi) = 0 because x_hi = 0 and y_hi = 0
  format %{ "MUL    EDX:EAX,$src.lo\n\t" %}
  ins_encode %{
    __ mull($src$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Multiply Register Long by small constant
// Constant fits in 8 bits (immL_127), so the high-word partial product
// only needs one IMUL-by-immediate.
instruct mulL_eReg_con(eADXRegL dst, immL_127 src, eRegI tmp, eFlagsReg cr) %{
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(2*100+2*400);
  size(12);
// Basic idea: lo(result) = lo(src * EAX)
//             hi(result) = hi(src * EAX) + lo(src * EDX)
  format %{ "IMUL   $tmp,EDX,$src\n\t"
            "MOV    EDX,$src\n\t"
            "MUL    EDX\t# EDX*EAX -> EDX:EAX\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode( long_multiply_con( dst, src, tmp ) );
  ins_pipe( pipe_slow );
%}
8695 
// Integer DIV with Register
// Special-cases min_jint / -1 (which would fault in IDIV with #DE overflow)
// by branching around the divide and leaving EDX zeroed.
instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
  match(Set rax (DivI rax div));
  effect(KILL rdx, KILL cr);
  size(26);
  ins_cost(30*100+10*100);
  format %{ "CMP    EAX,0x80000000\n\t"
            "JNE,s  normal\n\t"
            "XOR    EDX,EDX\n\t"
            "CMP    ECX,-1\n\t"
            "JE,s   done\n"
    "normal: CDQ\n\t"
            "IDIV   $div\n\t"
    "done:"        %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Divide Register Long
// No inline 64-bit divide on x86-32: call out to SharedRuntime::ldiv.
instruct divL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  match(Set dst (DivL src1 src2));
  effect( KILL cr, KILL cx, KILL bx );
  ins_cost(10000);
  format %{ "PUSH   $src1.hi\n\t"
            "PUSH   $src1.lo\n\t"
            "PUSH   $src2.hi\n\t"
            "PUSH   $src2.lo\n\t"
            "CALL   SharedRuntime::ldiv\n\t"
            "ADD    ESP,16" %}
  ins_encode( long_div(src1,src2) );
  ins_pipe( pipe_slow );
%}
8729 
// Integer DIVMOD with Register, both quotient and mod results
// rax and rdx are both results here (DivModI produces a pair), so neither
// is KILLed; same min_jint / -1 guard as divI_eReg.
instruct divModI_eReg_divmod(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
  match(DivModI rax div);
  effect(KILL cr);
  size(26);
  ins_cost(30*100+10*100);
  format %{ "CMP    EAX,0x80000000\n\t"
            "JNE,s  normal\n\t"
            "XOR    EDX,EDX\n\t"
            "CMP    ECX,-1\n\t"
            "JE,s   done\n"
    "normal: CDQ\n\t"
            "IDIV   $div\n\t"
    "done:"        %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( pipe_slow );
%}

// Integer MOD with Register
// Remainder comes back in EDX; EAX (the dividend) is clobbered by IDIV.
instruct modI_eReg(eDXRegI rdx, eAXRegI rax, eCXRegI div, eFlagsReg cr) %{
  match(Set rdx (ModI rax div));
  effect(KILL rax, KILL cr);

  size(26);
  ins_cost(300);
  format %{ "CDQ\n\t"
            "IDIV   $div" %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Remainder Register Long
// Like divL_eReg: runtime call, here to SharedRuntime::lrem.
instruct modL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  match(Set dst (ModL src1 src2));
  effect( KILL cr, KILL cx, KILL bx );
  ins_cost(10000);
  format %{ "PUSH   $src1.hi\n\t"
            "PUSH   $src1.lo\n\t"
            "PUSH   $src2.hi\n\t"
            "PUSH   $src2.lo\n\t"
            "CALL   SharedRuntime::lrem\n\t"
            "ADD    ESP,16" %}
  ins_encode( long_mod(src1,src2) );
  ins_pipe( pipe_slow );
%}
8777 
// Integer Shift Instructions
// Shift Left by one (short D1 /4 form, no immediate byte)
instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xD1, 0x4);  /* D1 /4 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Shift Left by 8-bit immediate
instruct salI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xC1, 0x4);  /* C1 /4 ib */
  ins_encode( RegOpcImm( dst, shift) );
  ins_pipe( ialu_reg );
%}

// Shift Left by variable (count must be in CL, hence eCXRegI)
instruct salI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xD3, 0x4);  /* D3 /4 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}
8814 
// Arithmetic shift right by one (register form, D1 /7)
instruct sarI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD1, 0x7);  /* D1 /7 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Arithmetic shift right by one (memory read-modify-write form)
instruct sarI_mem_1(memory dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  effect(KILL cr);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD1, 0x7);  /* D1 /7 */
  ins_encode( OpcP, RMopc_Mem(secondary,dst) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by 8-bit immediate
instruct sarI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( RegOpcImm( dst, shift ) );
  // NOTE(review): register form scheduled on ialu_mem_imm, unlike the other
  // reg-imm shifts which use ialu_reg — looks like a copy-paste from the mem
  // form; affects scheduling only, confirm before changing.
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by 8-bit immediate (memory read-modify-write form)
instruct sarI_mem_imm(memory dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  effect(KILL cr);

  format %{ "SAR    $dst,$shift" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( OpcP, RMopc_Mem(secondary, dst ), Con8or32( shift ) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by variable (count in CL)
instruct sarI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD3, 0x7);  /* D3 /7 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}
8871 
// Logical shift right by one
instruct shrI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xD1, 0x5);  /* D1 /5 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Logical Shift Right by 8-bit immediate
instruct shrI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xC1, 0x5);  /* C1 /5 ib */
  ins_encode( RegOpcImm( dst, shift) );
  ins_pipe( ialu_reg );
%}


// Logical Shift Right by 24, followed by Arithmetic Shift Left by 24.
// This idiom is used by the compiler for the i2b bytecode.
// src must be byte-addressable (xRegI) since MOVSX reads its low byte.
instruct i2b(eRegI dst, xRegI src, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI src twentyfour) twentyfour));

  size(3);
  format %{ "MOVSX  $dst,$src :8" %}
  ins_encode %{
    __ movsbl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg_reg);
%}

// Logical Shift Right by 16, followed by Arithmetic Shift Left by 16.
// This idiom is used by the compiler for the i2s bytecode.
instruct i2s(eRegI dst, xRegI src, immI_16 sixteen) %{
  match(Set dst (RShiftI (LShiftI src sixteen) sixteen));

  size(3);
  format %{ "MOVSX  $dst,$src :16" %}
  ins_encode %{
    __ movswl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg_reg);
%}


// Logical Shift Right by variable (count in CL)
instruct shrI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xD3, 0x5);  /* D3 /5 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}
8935 
8936 
//----------Logical Instructions-----------------------------------------------
//----------Integer Logical Instructions---------------------------------------
// And Instructions
// And Register with Register
instruct andI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (AndI dst src));
  effect(KILL cr);

  size(2);
  format %{ "AND    $dst,$src" %}
  opcode(0x23);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// And Register with Immediate (81 /4; Con8or32 picks the short form when possible)
instruct andI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (AndI dst src));
  effect(KILL cr);

  format %{ "AND    $dst,$src" %}
  opcode(0x81,0x04);  /* Opcode 81 /4 */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// And Register with Memory
instruct andI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (AndI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "AND    $dst,$src" %}
  opcode(0x23);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// And Memory with Register (read-modify-write)
instruct andI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "AND    $dst,$src" %}
  opcode(0x21);  /* Opcode 21 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// And Memory with Immediate (read-modify-write)
instruct andI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "AND    $dst,$src" %}
  opcode(0x81, 0x4);  /* Opcode 81 /4 id */
  // ins_encode( MemImm( dst, src) );
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}
9000 
// Or Instructions
// Or Register with Register
instruct orI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (OrI dst src));
  effect(KILL cr);

  size(2);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Or Register with a pointer reinterpreted as int (CastP2X)
instruct orI_eReg_castP2X(eRegI dst, eRegP src, eFlagsReg cr) %{
  match(Set dst (OrI dst (CastP2X src)));
  effect(KILL cr);

  size(2);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}


// Or Register with Immediate (81 /1; Con8or32 picks the short form)
instruct orI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (OrI dst src));
  effect(KILL cr);

  format %{ "OR     $dst,$src" %}
  opcode(0x81,0x01);  /* Opcode 81 /1 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Or Register with Memory
instruct orI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (OrI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Or Memory with Register (read-modify-write)
instruct orI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "OR     $dst,$src" %}
  opcode(0x09);  /* Opcode 09 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Or Memory with Immediate (read-modify-write)
instruct orI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "OR     $dst,$src" %}
  opcode(0x81,0x1);  /* Opcode 81 /1 id */
  // ins_encode( MemImm( dst, src) );
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}
9074 
// ROL/ROR
// ROL expand
// These match no ideal node directly; they are targets for the expand rules
// below, which recognize the (OrI (LShiftI ..) (URShiftI ..)) rotate idiom.
instruct rolI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROL    $dst, $shift" %}
  opcode(0xD1, 0x0); /* Opcode D1 /0 */
  ins_encode( OpcP, RegOpc( dst ));
  ins_pipe( ialu_reg );
%}

instruct rolI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROL    $dst, $shift" %}
  opcode(0xC1, 0x0); /*Opcode /C1  /0  */
  ins_encode( RegOpcImm(dst, shift) );
  ins_pipe(ialu_reg);
%}

// Variable rotate: count must be in CL, so dst is ncxRegI (anything but ECX).
instruct rolI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROL    $dst, $shift" %}
  opcode(0xD3, 0x0);    /* Opcode D3 /0 */
  ins_encode(OpcP, RegOpc(dst));
  ins_pipe( ialu_reg_reg );
%}
// end of ROL expand

// ROL 32bit by one once: (x << 1) | (x >>> -1)
instruct rolI_eReg_i1(eRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));

  expand %{
    rolI_eReg_imm1(dst, lshift, cr);
  %}
%}

// ROL 32bit var by imm8 once
// Predicate requires the two shift counts to sum to 0 mod 32,
// i.e. the pair of shifts really is a rotate.
instruct rolI_eReg_i8(eRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
  predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
  match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));

  expand %{
    rolI_eReg_imm8(dst, lshift, cr);
  %}
%}

// ROL 32bit var by var once: (x << s) | (x >>> (0 - s))
instruct rolI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI zero shift))));

  expand %{
    rolI_eReg_CL(dst, shift, cr);
  %}
%}

// ROL 32bit var by var once: (x << s) | (x >>> (32 - s))
instruct rolI_eReg_Var_C32(ncxRegI dst, eCXRegI shift, immI_32 c32, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI c32 shift))));

  expand %{
    rolI_eReg_CL(dst, shift, cr);
  %}
%}
9141 
// ROR expand
// Mirror of the ROL expand targets above, for right rotates.
instruct rorI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROR    $dst, $shift" %}
  opcode(0xD1,0x1);  /* Opcode D1 /1 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

instruct rorI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
  effect (USE_DEF dst, USE shift, KILL cr);

  format %{ "ROR    $dst, $shift" %}
  opcode(0xC1, 0x1); /* Opcode /C1 /1 ib */
  ins_encode( RegOpcImm(dst, shift) );
  ins_pipe( ialu_reg );
%}

// Variable rotate: count must be in CL, so dst is ncxRegI (anything but ECX).
instruct rorI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr)%{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROR    $dst, $shift" %}
  opcode(0xD3, 0x1);    /* Opcode D3 /1 */
  ins_encode(OpcP, RegOpc(dst));
  ins_pipe( ialu_reg_reg );
%}
// end of ROR expand

// ROR right once: (x >>> 1) | (x << -1)
instruct rorI_eReg_i1(eRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));

  expand %{
    rorI_eReg_imm1(dst, rshift, cr);
  %}
%}

// ROR 32bit by immI8 once
// Predicate requires the two shift counts to sum to 0 mod 32 (true rotate).
instruct rorI_eReg_i8(eRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
  predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
  match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));

  expand %{
    rorI_eReg_imm8(dst, rshift, cr);
  %}
%}

// ROR 32bit var by var once: (x >>> s) | (x << (0 - s))
instruct rorI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI zero shift))));

  expand %{
    rorI_eReg_CL(dst, shift, cr);
  %}
%}

// ROR 32bit var by var once: (x >>> s) | (x << (32 - s))
instruct rorI_eReg_Var_C32(ncxRegI dst, eCXRegI shift, immI_32 c32, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI c32 shift))));

  expand %{
    rorI_eReg_CL(dst, shift, cr);
  %}
%}
9207 
// Xor Instructions
// Xor Register with Register
instruct xorI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (XorI dst src));
  effect(KILL cr);

  size(2);
  format %{ "XOR    $dst,$src" %}
  opcode(0x33);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Xor Register with Immediate -1
// x ^ -1 is a bitwise complement; NOT does not write EFLAGS, so no KILL cr.
instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{
  match(Set dst (XorI dst imm));

  size(2);
  format %{ "NOT    $dst" %}
  ins_encode %{
     __ notl($dst$$Register);
  %}
  ins_pipe( ialu_reg );
%}

// Xor Register with Immediate (81 /6; Con8or32 picks the short form)
instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (XorI dst src));
  effect(KILL cr);

  format %{ "XOR    $dst,$src" %}
  opcode(0x81,0x06);  /* Opcode 81 /6 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Xor Register with Memory
instruct xorI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (XorI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "XOR    $dst,$src" %}
  opcode(0x33);
  ins_encode( OpcP, RegMem(dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Xor Memory with Register (read-modify-write)
instruct xorI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "XOR    $dst,$src" %}
  opcode(0x31);  /* Opcode 31 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Xor Memory with Immediate (read-modify-write)
instruct xorI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "XOR    $dst,$src" %}
  opcode(0x81,0x6);  /* Opcode 81 /6 id */
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}
9280 
//----------Convert Int to Boolean---------------------------------------------
// Conv2B expands to a copy plus a NEG/ADC pair: NEG sets carry iff the
// value is non-zero, then ADC dst,src folds that carry into 0 or 1.

// Plain register copy (no match rule; expand target only).
instruct movI_nocopy(eRegI dst, eRegI src) %{
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// int -> boolean kernel: NEG dst; ADC dst,src (0x13 is ADC r32,r/m32).
instruct ci2b( eRegI dst, eRegI src, eFlagsReg cr ) %{
  effect( USE_DEF dst, USE src, KILL cr );

  size(4);
  format %{ "NEG    $dst\n\t"
            "ADC    $dst,$src" %}
  ins_encode( neg_reg(dst),
              OpcRegReg(0x13,dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convI2B( eRegI dst, eRegI src, eFlagsReg cr ) %{
  match(Set dst (Conv2B src));

  expand %{
    movI_nocopy(dst,src);
    ci2b(dst,src,cr);
  %}
%}

// Pointer copy into an int register (expand target only).
instruct movP_nocopy(eRegI dst, eRegP src) %{
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// pointer -> boolean kernel, same NEG/ADC trick as ci2b.
instruct cp2b( eRegI dst, eRegP src, eFlagsReg cr ) %{
  effect( USE_DEF dst, USE src, KILL cr );
  format %{ "NEG    $dst\n\t"
            "ADC    $dst,$src" %}
  ins_encode( neg_reg(dst),
              OpcRegReg(0x13,dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convP2B( eRegI dst, eRegP src, eFlagsReg cr ) %{
  match(Set dst (Conv2B src));

  expand %{
    movP_nocopy(dst,src);
    cp2b(dst,src,cr);
  %}
%}
9334 
// Produce -1 if p < q, else 0 (branch-free compare-to-mask).
instruct cmpLTMask( eCXRegI dst, ncxRegI p, ncxRegI q, eFlagsReg cr ) %{
  match(Set dst (CmpLTMask p q));
  effect( KILL cr );
  ins_cost(400);

  // SETlt can only use low byte of EAX,EBX, ECX, or EDX as destination
  format %{ "XOR    $dst,$dst\n\t"
            "CMP    $p,$q\n\t"
            "SETlt  $dst\n\t"
            "NEG    $dst" %}
  ins_encode( OpcRegReg(0x33,dst,dst),
              OpcRegReg(0x3B,p,q),
              setLT_reg(dst), neg_reg(dst) );
  ins_pipe( pipe_slow );
%}

// Compare against zero: the sign bit already holds the answer,
// so a single arithmetic shift by 31 produces the -1/0 mask.
instruct cmpLTMask0( eRegI dst, immI0 zero, eFlagsReg cr ) %{
  match(Set dst (CmpLTMask dst zero));
  effect( DEF dst, KILL cr );
  ins_cost(100);

  format %{ "SAR    $dst,31" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( RegOpcImm( dst, 0x1F ) );
  ins_pipe( ialu_reg );
%}


// Fused form of p += (p < q ? y : 0) - q, using SBB to build the mask.
instruct cadd_cmpLTMask( ncxRegI p, ncxRegI q, ncxRegI y, eCXRegI tmp, eFlagsReg cr ) %{
  match(Set p (AddI (AndI (CmpLTMask p q) y) (SubI p q)));
  effect( KILL tmp, KILL cr );
  ins_cost(400);
  // annoyingly, $tmp has no edges so you can't ask for it in
  // any format or encoding
  format %{ "SUB    $p,$q\n\t"
            "SBB    ECX,ECX\n\t"
            "AND    ECX,$y\n\t"
            "ADD    $p,ECX" %}
  ins_encode( enc_cmpLTP(p,q,y,tmp) );
  ins_pipe( pipe_cmplt );
%}

/* If I enable this, I encourage spilling in the inner loop of compress.
instruct cadd_cmpLTMask_mem( ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr ) %{
  match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q)));
  effect( USE_KILL tmp, KILL cr );
  ins_cost(400);

  format %{ "SUB    $p,$q\n\t"
            "SBB    ECX,ECX\n\t"
            "AND    ECX,$y\n\t"
            "ADD    $p,ECX" %}
  ins_encode( enc_cmpLTP_mem(p,q,y,tmp) );
%}
*/
9390 
//----------Long Instructions------------------------------------------------
// 64-bit values live in register pairs on x86-32; each long ALU op is an
// ADD/ADC (or SUB/SBB) pair carrying between the low and high halves.

// Add Long Register with Register
instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (AddL dst src));
  effect(KILL cr);
  ins_cost(200);
  format %{ "ADD    $dst.lo,$src.lo\n\t"
            "ADC    $dst.hi,$src.hi" %}
  opcode(0x03, 0x13);
  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Add Long Register with Immediate
instruct addL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (AddL dst src));
  effect(KILL cr);
  format %{ "ADD    $dst.lo,$src.lo\n\t"
            "ADC    $dst.hi,$src.hi" %}
  opcode(0x81,0x00,0x02);  /* Opcode 81 /0, 81 /2 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Add Long Register with Memory ($mem holds the low word, $mem+4 the high)
instruct addL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (AddL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "ADD    $dst.lo,$mem\n\t"
            "ADC    $dst.hi,$mem+4" %}
  opcode(0x03, 0x13);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}
9426 
// Subtract Long Register with Register.
// Long subtract mirrors long add: SUB on the low halves, then SBB on the
// high halves to propagate the borrow.  All forms clobber EFLAGS.
instruct subL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (SubL dst src));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SUB    $dst.lo,$src.lo\n\t"
            "SBB    $dst.hi,$src.hi" %}
  opcode(0x2B, 0x1B);  // 0x2B = SUB r32,r/m32; 0x1B = SBB r32,r/m32
  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Subtract Long Register with Immediate
instruct subL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (SubL dst src));
  effect(KILL cr);
  format %{ "SUB    $dst.lo,$src.lo\n\t"
            "SBB    $dst.hi,$src.hi" %}
  opcode(0x81,0x05,0x03);  /* Opcode 81 /5 (SUB imm), 81 /3 (SBB imm) */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Subtract Long Register with Memory
instruct subL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (SubL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "SUB    $dst.lo,$mem\n\t"
            "SBB    $dst.hi,$mem+4" %}
  opcode(0x2B, 0x1B);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Long negate (0 - dst): NEG hi, NEG lo, then SBB hi,0 corrects the high
// word when NEG lo produced a borrow (i.e. the low word was non-zero).
instruct negL_eReg(eRegL dst, immL0 zero, eFlagsReg cr) %{
  match(Set dst (SubL zero dst));
  effect(KILL cr);
  ins_cost(300);
  format %{ "NEG    $dst.hi\n\tNEG    $dst.lo\n\tSBB    $dst.hi,0" %}
  ins_encode( neg_long(dst) );
  ins_pipe( ialu_reg_reg_long );
%}
9470 
// And Long Register with Register
// Bitwise ops act independently on each 32-bit half; no carry crosses the
// halves, the same opcode is simply applied to lo and hi.
instruct andL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (AndL dst src));
  effect(KILL cr);
  format %{ "AND    $dst.lo,$src.lo\n\t"
            "AND    $dst.hi,$src.hi" %}
  opcode(0x23,0x23);  // 0x23 = AND r32,r/m32 (used for both halves)
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// And Long Register with Immediate
instruct andL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (AndL dst src));
  effect(KILL cr);
  format %{ "AND    $dst.lo,$src.lo\n\t"
            "AND    $dst.hi,$src.hi" %}
  opcode(0x81,0x04,0x04);  /* Opcode 81 /4 (AND imm), 81 /4 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// And Long Register with Memory
instruct andL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (AndL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "AND    $dst.lo,$mem\n\t"
            "AND    $dst.hi,$mem+4" %}
  opcode(0x23, 0x23);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}
9504 
// Or Long Register with Register
// Same half-by-half scheme as long AND, using the OR opcode.
instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (OrL dst src));
  effect(KILL cr);
  format %{ "OR     $dst.lo,$src.lo\n\t"
            "OR     $dst.hi,$src.hi" %}
  opcode(0x0B,0x0B);  // 0x0B = OR r32,r/m32 (both halves)
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Or Long Register with Immediate
instruct orl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (OrL dst src));
  effect(KILL cr);
  format %{ "OR     $dst.lo,$src.lo\n\t"
            "OR     $dst.hi,$src.hi" %}
  opcode(0x81,0x01,0x01);  /* Opcode 81 /1 (OR imm), 81 /1 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Or Long Register with Memory
instruct orl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (OrL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "OR     $dst.lo,$mem\n\t"
            "OR     $dst.hi,$mem+4" %}
  opcode(0x0B,0x0B);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}
9538 
// Xor Long Register with Register
instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (XorL dst src));
  effect(KILL cr);
  format %{ "XOR    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$src.hi" %}
  opcode(0x33,0x33);  // 0x33 = XOR r32,r/m32 (both halves)
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Xor Long Register with Immediate -1
// XOR with all-ones is bitwise NOT of both halves.  NOT does not modify
// EFLAGS, which is why this rule declares no flags effect.
instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
  match(Set dst (XorL dst imm));  
  format %{ "NOT    $dst.lo\n\t"
            "NOT    $dst.hi" %}
  ins_encode %{
     __ notl($dst$$Register);
     __ notl(HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}

// Xor Long Register with Immediate
instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (XorL dst src));
  effect(KILL cr);
  format %{ "XOR    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$src.hi" %}
  opcode(0x81,0x06,0x06);  /* Opcode 81 /6 (XOR imm), 81 /6 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Xor Long Register with Memory
instruct xorl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (XorL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "XOR    $dst.lo,$mem\n\t"
            "XOR    $dst.hi,$mem+4" %}
  opcode(0x33,0x33);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}
9584 
// Shift Left Long by 1
// Guarded by -XX:+UseNewLongLShift: a left shift by one of a register pair
// is ADD lo,lo (doubling) followed by ADC hi,hi, which carries the top bit
// of the low word into the high word.  Shifts by 2 and 3 repeat the pair.
instruct shlL_eReg_1(eRegL dst, immI_1 cnt, eFlagsReg cr) %{
  predicate(UseNewLongLShift);
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(100);
  format %{ "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi" %}
  ins_encode %{
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 2
instruct shlL_eReg_2(eRegL dst, immI_2 cnt, eFlagsReg cr) %{
  predicate(UseNewLongLShift);
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(100);
  format %{ "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi\n\t" 
            "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi" %}
  ins_encode %{
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 3
instruct shlL_eReg_3(eRegL dst, immI_3 cnt, eFlagsReg cr) %{
  predicate(UseNewLongLShift);
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(100);
  format %{ "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi\n\t" 
            "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi\n\t" 
            "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi" %}
  ins_encode %{
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}
9641 
// Shift Left Long by 1-31
// SHLD shifts bits from the low word into the high word, then SHL shifts
// the low word itself; together they form a 64-bit left shift.
instruct shlL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHLD   $dst.hi,$dst.lo,$cnt\n\t"
            "SHL    $dst.lo,$cnt" %}
  opcode(0xC1, 0x4, 0xA4);  /* 0F/A4 (SHLD), then C1 /4 ib (SHL) */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 32-63
// For counts >= 32 the low word moves wholesale into the high word (shifted
// by cnt-32) and the low word becomes zero.
instruct shlL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.hi,$dst.lo\n"
          "\tSHL    $dst.hi,$cnt-32\n"
          "\tXOR    $dst.lo,$dst.lo" %}
  opcode(0xC1, 0x4);  /* C1 /4 ib */
  ins_encode( move_long_big_shift_clr(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by variable
// TEST $shift,32 checks bit 5 of the count: for counts >= 32 the words are
// pre-shuffled (lo -> hi, lo cleared), then the SHLD/SHL tail shifts by the
// remaining count (the hardware masks shift counts to 5 bits).
instruct salL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (LShiftL dst shift));
  effect(KILL cr);
  ins_cost(500+200);
  size(17);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "XOR    $dst.lo,$dst.lo\n"
    "small:\tSHLD   $dst.hi,$dst.lo,$shift\n\t"
            "SHL    $dst.lo,$shift" %}
  ins_encode( shift_left_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}
9682 
// Shift Right Long by 1-31 (logical / unsigned)
// Mirror image of the left shift: SHRD shifts bits from the high word into
// the low word, then SHR zero-fills the high word.
instruct shrL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (URShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
            "SHR    $dst.hi,$cnt" %}
  opcode(0xC1, 0x5, 0xAC);  /* 0F/AC (SHRD), then C1 /5 ib (SHR) */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right Long by 32-63
instruct shrL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (URShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.lo,$dst.hi\n"
          "\tSHR    $dst.lo,$cnt-32\n"
          "\tXOR    $dst.hi,$dst.hi" %}
  opcode(0xC1, 0x5);  /* C1 /5 ib */
  ins_encode( move_long_big_shift_clr(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right Long by variable
// Same >=32 pre-shuffle trick as salL_eReg_CL, zero-filling the high word.
instruct shrL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (URShiftL dst shift));
  effect(KILL cr);
  ins_cost(600);
  size(17);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.lo,$dst.hi\n\t"
            "XOR    $dst.hi,$dst.hi\n"
    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
            "SHR    $dst.hi,$shift" %}
  ins_encode( shift_right_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}
9723 
// Shift Right arithmetic Long by 1-31
// Like the logical shift but SAR on the high word replicates the sign bit.
instruct sarL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (RShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
            "SAR    $dst.hi,$cnt" %}
  opcode(0xC1, 0x7, 0xAC);  /* 0F/AC (SHRD), then C1 /7 ib (SAR) */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right arithmetic Long by 32-63
// SAR hi,31 fills the high word with copies of the sign bit.
instruct sarL_eReg_32_63( eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (RShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.lo,$dst.hi\n"
          "\tSAR    $dst.lo,$cnt-32\n"
          "\tSAR    $dst.hi,31" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( move_long_big_shift_sign(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right arithmetic Long by variable
// size(18): one byte larger than the logical variant because the >=32 path
// uses SAR $dst.hi,31 (with imm8) instead of XOR to preserve the sign.
instruct sarL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (RShiftL dst shift));
  effect(KILL cr);
  ins_cost(600);
  size(18);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.lo,$dst.hi\n\t"
            "SAR    $dst.hi,31\n"
    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
            "SAR    $dst.hi,$shift" %}
  ins_encode( shift_right_arith_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}
9764 
9765 
//----------Double Instructions------------------------------------------------
// Double Math

// Compare & branch

// P6 version of float compare, sets condition codes in EFLAGS
// FUCOMIP compares ST(0) against ST(i) and writes ZF/PF/CF directly; it is
// only available on P6 and later, hence the supports_cmov() predicate.  The
// cmpF_P6_fixup turns the unordered result (PF set, i.e. a NaN operand) into
// CF=1 via MOV ah,1 / SAHF; EAX is killed as the SAHF scratch register.
instruct cmpD_cc_P6(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
  predicate(VM_Version::supports_cmov() && UseSSE <=1);
  match(Set cr (CmpD src1 src2));
  effect(KILL rax);
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction\n\t"
            "JNP    exit\n\t"
            "MOV    ah,1       // saw a NaN, set CF\n\t"
            "SAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              cmpF_P6_fixup );
  ins_pipe( pipe_slow );
%}

// Same compare for an eFlagsRegUCF user: omits the NaN fixup entirely, so
// no scratch register is killed and the cost is lower.
instruct cmpD_cc_P6CF(eFlagsRegUCF cr, regD src1, regD src2) %{
  predicate(VM_Version::supports_cmov() && UseSSE <=1);
  match(Set cr (CmpD src1 src2));
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2));
  ins_pipe( pipe_slow );
%}
9801 
// Compare & branch
// Pre-P6 x87 compare: FCOM sets the FPU condition codes, FNSTSW AX copies
// the status word into AX, and SAHF moves AH into EFLAGS.  TEST AX,0x400
// checks the C2 (unordered) bit; a NaN is forced to compare as "less than".
instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2, eAXRegI rax) %{
  predicate(UseSSE<=1);
  match(Set cr (CmpD src1 src2));
  effect(KILL rax);
  ins_cost(200);
  format %{ "FLD    $src1\n\t"
            "FCOMp  $src2\n\t"
            "FNSTSW AX\n\t"
            "TEST   AX,0x400\n\t"
            "JZ,s   flags\n\t"
            "MOV    AH,1\t# unordered treat as LT\n"
    "flags:\tSAHF" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              fpu_flags);
  ins_pipe( pipe_slow );
%}

// Compare vs zero into -1,0,1
// FTST (D9 E4) compares ST(0) against +0.0; CmpF_Result materializes the
// three-way result (-1/0/1) into the integer register $dst.
instruct cmpD_0(eRegI dst, regD src1, immD0 zero, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (CmpD3 src1 zero));
  effect(KILL cr, KILL rax);
  ins_cost(280);
  format %{ "FTSTD  $dst,$src1" %}
  opcode(0xE4, 0xD9);  // emitted reversed (OpcS, OpcP) as D9 E4 = FTST
  ins_encode( Push_Reg_D(src1),
              OpcS, OpcP, PopFPU,
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1
instruct cmpD_reg(eRegI dst, regD src1, regD src2, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (CmpD3 src1 src2));
  effect(KILL cr, KILL rax);
  ins_cost(300);
  format %{ "FCMPD  $dst,$src1,$src2" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}
9849 
// float compare and set condition codes in EFLAGS by XMM regs
// COMISD (66 0F 2F) compares scalar doubles and sets ZF/PF/CF directly.
// As with the P6 x87 form, cmpF_P6_fixup converts the unordered outcome
// (PF set) into CF=1, killing EAX as the SAHF scratch.
instruct cmpXD_cc(eFlagsRegU cr, regXD dst, regXD src, eAXRegI rax) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD dst src));
  effect(KILL rax);
  ins_cost(125);
  format %{ "COMISD $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// eFlagsRegUCF variant: no NaN fixup, no scratch register.
instruct cmpXD_ccCF(eFlagsRegUCF cr, regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD dst src));
  ins_cost(100);
  format %{ "COMISD $dst,$src" %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
// Memory-operand form of cmpXD_cc.
instruct cmpXD_ccmem(eFlagsRegU cr, regXD dst, memory src, eAXRegI rax) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD dst (LoadD src)));
  effect(KILL rax);
  ins_cost(145);
  format %{ "COMISD $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// Memory-operand form of cmpXD_ccCF (no fixup needed).
instruct cmpXD_ccmemCF(eFlagsRegUCF cr, regXD dst, memory src) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD dst (LoadD src)));
  ins_cost(100);
  format %{ "COMISD $dst,$src" %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(dst, src));
  ins_pipe( pipe_slow );
%}
9901 
// Compare into -1,0,1 in XMM
// $dst is zeroed with XOR *before* COMISD because XOR clobbers EFLAGS;
// then DEC/INC produce -1 or 1.  The unordered case (JP) falls into the
// DEC path, so NaN compares as -1.
instruct cmpXD_reg(eRegI dst, regXD src1, regXD src2, eFlagsReg cr) %{
  predicate(UseSSE>=2);
  match(Set dst (CmpD3 src1 src2));
  effect(KILL cr);
  ins_cost(255);
  format %{ "XOR    $dst,$dst\n"
          "\tCOMISD $src1,$src2\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(Xor_Reg(dst), OpcP, OpcS, Opcode(tertiary), RegReg(src1, src2),
             CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM and memory
// Here $dst is cleared with MOV $dst,0 *after* the compare, since MOV
// (unlike XOR) leaves EFLAGS intact.
instruct cmpXD_regmem(eRegI dst, regXD src1, memory mem, eFlagsReg cr) %{
  predicate(UseSSE>=2);
  match(Set dst (CmpD3 src1 (LoadD mem)));
  effect(KILL cr);
  ins_cost(275);
  format %{ "COMISD $src1,$mem\n"
          "\tMOV    $dst,0\t\t# do not blow flags\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x66, 0x0F, 0x2F);
  ins_encode(OpcP, OpcS, Opcode(tertiary), RegMem(src1, mem),
             LdImmI(dst,0x0), CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}
9945 
9946 
// x87 double subtract: push $src onto the FPU stack, then subtract the
// pushed value from $dst with a subtract-and-pop (dst = dst - src).
instruct subD_reg(regD dst, regD src) %{
  predicate (UseSSE <=1);
  match(Set dst (SubD dst src));

  format %{ "FLD    $src\n\t"
            "DSUBp  $dst,ST" %}
  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
  ins_cost(150);
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Subtract with an explicit round to double precision: the FSTP to the
// stack slot $dst narrows the 80-bit intermediate result ("# D-round").
instruct subD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  predicate (UseSSE <=1);
  match(Set dst (RoundDouble (SubD src1 src2)));
  ins_cost(250);

  format %{ "FLD    $src2\n\t"
            "DSUB   ST,$src1\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x5);
  ins_encode( Push_Reg_D(src2),
              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}

// Subtract with a memory operand, loaded via FLD_D (DD /0).
instruct subD_reg_mem(regD dst, memory src) %{
  predicate (UseSSE <=1);
  match(Set dst (SubD dst (LoadD src)));
  ins_cost(150);

  format %{ "FLD    $src\n\t"
            "DSUBp  $dst,ST" %}
  opcode(0xDE, 0x5, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}
9987 
// Absolute value, x87 path: FABS (D9 E1) operates on the top of the FPU
// stack, so both operands are pinned to ST(0) via regDPR1.
instruct absD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (AbsD src));
  ins_cost(100);
  format %{ "FABS" %}
  opcode(0xE1, 0xD9);  // emitted reversed (OpcS, OpcP) as D9 E1
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

// Absolute value, SSE2 path: ANDPD against a constant that clears only the
// sign bit (see format).
instruct absXD_reg( regXD dst ) %{
  predicate(UseSSE>=2);
  match(Set dst (AbsD dst));
  format %{ "ANDPD  $dst,[0x7FFFFFFFFFFFFFFF]\t# ABS D by sign masking" %}
  ins_encode( AbsXD_encoding(dst));
  ins_pipe( pipe_slow );
%}

// Negate, x87 path: FCHS (D9 E0), again pinned to ST(0).
instruct negD_reg(regDPR1 dst, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set dst (NegD src));
  ins_cost(100);
  format %{ "FCHS" %}
  opcode(0xE0, 0xD9);  // emitted reversed (OpcS, OpcP) as D9 E0
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

// Negate, SSE2 path: XORPD with a sign-bit-only constant flips the sign.
instruct negXD_reg( regXD dst ) %{
  predicate(UseSSE>=2);
  match(Set dst (NegD dst));
  format %{ "XORPD  $dst,[0x8000000000000000]\t# CHS D by sign flipping" %}
  ins_encode %{
     __ xorpd($dst$$XMMRegister,
              ExternalAddress((address)double_signflip_pool));
  %}
  ins_pipe( pipe_slow );
%}
10026 
// x87 double add: push $src, then add-and-pop into $dst (DE C0+i = FADDP).
instruct addD_reg(regD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst src));
  format %{ "FLD    $src\n\t"
            "DADD   $dst,ST" %}
  size(4);
  ins_cost(150);
  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}


// Add with an explicit round to double precision via FSTP to a stack slot.
instruct addD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  predicate(UseSSE<=1);
  match(Set dst (RoundDouble (AddD src1 src2)));
  ins_cost(250);

  format %{ "FLD    $src2\n\t"
            "DADD   ST,$src1\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x0); /* D8 C0+i or D8 /0*/
  ins_encode( Push_Reg_D(src2),
              OpcP, RegOpc(src1), Pop_Mem_D(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}


// Add with a memory operand loaded via FLD_D (DD /0).
instruct addD_reg_mem(regD dst, memory src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst (LoadD src)));
  ins_cost(150);

  format %{ "FLD    $src\n\t"
            "DADDp  $dst,ST" %}
  opcode(0xDE, 0x0, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}
10068 
// add-to-memory
// Read-modify-write: load $dst, add $src, store back.  The store's FSTP
// rounds the result, satisfying the RoundDouble in the match rule.
instruct addD_mem_reg(memory dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (StoreD dst (RoundDouble (AddD (LoadD dst) src))));
  ins_cost(150);

  format %{ "FLD_D  $dst\n\t"
            "DADD   ST,$src\n\t"
            "FST_D  $dst" %}
  opcode(0xDD, 0x0);
  ins_encode( Opcode(0xDD), RMopc_Mem(0x00,dst),
              Opcode(0xD8), RegOpc(src),
              // set_instruction_start: presumably re-marks the instruction
              // boundary so the final store encodes correctly — see ADLC.
              set_instruction_start,
              Opcode(0xDD), RMopc_Mem(0x03,dst) );
  ins_pipe( fpu_reg_mem );
%}

// Add the constant 1.0: loaded with FLD1 rather than from the constant pool.
instruct addD_reg_imm1(regD dst, immD1 src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst src));
  ins_cost(125);
  format %{ "FLD1\n\t"
            "DADDp  $dst,ST" %}
  opcode(0xDE, 0x00);
  ins_encode( LdImmD(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg );
%}

// Add a general double constant.  The predicate excludes 0.0 and 1.0,
// which are matched by other (cheaper) rules.
instruct addD_reg_imm(regD dst, immD src) %{
  predicate(UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (AddD dst src));
  ins_cost(200);
  format %{ "FLD_D  [$src]\n\t"
            "DADDp  $dst,ST" %}
  opcode(0xDE, 0x00);       /* DE /0 */
  ins_encode( LdImmD(src),
              OpcP, RegOpc(dst));
  ins_pipe( fpu_reg_mem );
%}

// Constant add with an explicit round into a stack slot.
instruct addD_reg_imm_round(stackSlotD dst, regD src, immD con) %{
  predicate(UseSSE<=1 && _kids[0]->_kids[1]->_leaf->getd() != 0.0 && _kids[0]->_kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (RoundDouble (AddD src con)));
  ins_cost(200);
  format %{ "FLD_D  [$con]\n\t"
            "DADD   ST,$src\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x00);       /* D8 /0 */
  ins_encode( LdImmD(con),
              OpcP, RegOpc(src), Pop_Mem_D(dst));
  ins_pipe( fpu_mem_reg_con );
%}
10122 
// Add two double precision floating point values in xmm
// F2 0F 58 = ADDSD; the bytes are emitted literally by the encoding.
instruct addXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (AddD dst src));
  format %{ "ADDSD  $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// ADDSD with the constant materialized in memory (LdImmXD).
instruct addXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE>=2);
  match(Set dst (AddD dst con));
  format %{ "ADDSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), LdImmXD(dst, con) );
  ins_pipe( pipe_slow );
%}

// ADDSD with a memory operand.
instruct addXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE>=2);
  match(Set dst (AddD dst (LoadD mem)));
  format %{ "ADDSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x58), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Sub two double precision floating point values in xmm
// F2 0F 5C = SUBSD.
instruct subXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (SubD dst src));
  format %{ "SUBSD  $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct subXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE>=2);
  match(Set dst (SubD dst con));
  format %{ "SUBSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), LdImmXD(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct subXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE>=2);
  match(Set dst (SubD dst (LoadD mem)));
  format %{ "SUBSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}
10172 
// Mul two double precision floating point values in xmm
// F2 0F 59 = MULSD.
instruct mulXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (MulD dst src));
  format %{ "MULSD  $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct mulXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE>=2);
  match(Set dst (MulD dst con));
  format %{ "MULSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), LdImmXD(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct mulXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE>=2);
  match(Set dst (MulD dst (LoadD mem)));
  format %{ "MULSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Div two double precision floating point values in xmm
// F2 0F 5E = DIVSD.
instruct divXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (DivD dst src));
  format %{ "DIVSD  $dst,$src" %}
  // NOTE(review): this opcode() declaration appears unused — the encoding
  // below emits the bytes directly, and the sibling XD rules omit it.
  opcode(0xF2, 0x0F, 0x5E);
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct divXD_imm(regXD dst, immXD con) %{
  predicate(UseSSE>=2);
  match(Set dst (DivD dst con));
  format %{ "DIVSD  $dst,[$con]" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), LdImmXD(dst, con));
  ins_pipe( pipe_slow );
%}

instruct divXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE>=2);
  match(Set dst (DivD dst (LoadD mem)));
  format %{ "DIVSD  $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}
10223 
10224 
// x87 double multiply: push $src, then multiply-and-pop into $dst
// (DE C8+i = FMULP).
instruct mulD_reg(regD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (MulD dst src));
  format %{ "FLD    $src\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  ins_cost(150);
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Strict FP instruction biases argument before multiply then
// biases result to avoid double rounding of subnormals.
//
// scale arg1 by multiplying arg1 by 2^(-15360)
// load arg2
// multiply scaled arg1 by arg2
// rescale product by 2^(15360)
//
// ins_cost(1) deliberately undercuts every other double-multiply rule so
// this one always wins selection when the method is strictfp.
instruct strictfp_mulD_reg(regDPR1 dst, regnotDPR1 src) %{
  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  match(Set dst (MulD dst src));
  ins_cost(1);   // Select this instruction for all strict FP double multiplies

  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    $src\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
            "DMULp  $dst,ST\n\t" %}
  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  ins_encode( strictfp_bias1(dst),
              Push_Reg_D(src),
              OpcP, RegOpc(dst),
              strictfp_bias2(dst) );
  ins_pipe( fpu_reg_reg );
%}
10263 
// Multiply by a general double constant.  The predicate excludes 0.0 and
// 1.0, which are presumably handled by other rules or strength-reduced.
instruct mulD_reg_imm(regD dst, immD src) %{
  predicate( UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (MulD dst src));
  ins_cost(200);
  format %{ "FLD_D  [$src]\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1); /* DE /1 */
  ins_encode( LdImmD(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}


// Multiply with a memory operand, loaded via FLD_D (DD /0).
instruct mulD_reg_mem(regD dst, memory src) %{
  predicate( UseSSE<=1 );
  match(Set dst (MulD dst (LoadD src)));
  ins_cost(200);
  format %{ "FLD_D  $src\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1, 0xDD); /* DE C8+i or DE /1*/  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

//
// Cisc-alternate to reg-reg multiply
// Computes src * mem into a separate dst (src is left intact): load the
// memory operand, multiply by $src, pop the product into $dst.
instruct mulD_reg_mem_cisc(regD dst, regD src, memory mem) %{
  predicate( UseSSE<=1 );
  match(Set dst (MulD src (LoadD mem)));
  ins_cost(250);
  format %{ "FLD_D  $mem\n\t"
            "DMUL   ST,$src\n\t"
            "FSTP_D $dst" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadD D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem),
              OpcReg_F(src),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}
10304 
10305 
// MACRO3 -- addD a mulD
// This instruction is a '2-address' instruction in that the result goes
// back to src2.  This eliminates a move from the macro; possibly the
// register allocator will have to add it back (and maybe not).
instruct addD_mulD_reg(regD src2, regD src1, regD src0) %{
  predicate( UseSSE<=1 );
  match(Set src2 (AddD (MulD src0 src1) src2));
  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
            "DMUL   ST,$src1\n\t"
            "DADDp  $src2,ST" %}
  ins_cost(250);
  opcode(0xDD); /* LoadD DD /0 */
  ins_encode( Push_Reg_F(src0),      // push src0 onto the x87 stack
              FMul_ST_reg(src1),     // ST(0) = src0 * src1
              FAddP_reg_ST(src2) );  // src2 += ST(0), pop
  ins_pipe( fpu_reg_reg_reg );
%}


// MACRO3 -- subD a mulD
// Computes (src0 * src1) - src2, result back into src2 (2-address form).
instruct subD_mulD_reg(regD src2, regD src1, regD src0) %{
  predicate( UseSSE<=1 );
  match(Set src2 (SubD (MulD src0 src1) src2));
  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
            "DMUL   ST,$src1\n\t"
            "DSUBRp $src2,ST" %}
  ins_cost(250);
  ins_encode( Push_Reg_F(src0),
              FMul_ST_reg(src1),
              // DE E0+i = FSUBRP ST(i),ST: ST(i) = ST(0) - ST(i), pop --
              // i.e. src2 = product - src2, matching the match rule.
              Opcode(0xDE), Opc_plus(0xE0,src2));
  ins_pipe( fpu_reg_reg_reg );
%}
10338 
10339 
// Double divide, x87 path: dst = dst / src (2-address form).
instruct divD_reg(regD dst, regD src) %{
  predicate( UseSSE<=1 );
  match(Set dst (DivD dst src));

  format %{ "FLD    $src\n\t"
            "FDIVp  $dst,ST" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_cost(150);
  ins_encode( Push_Reg_D(src),
              OpcP, RegOpc(dst) );   // FDIVP ST(i),ST
  ins_pipe( fpu_reg_reg );
%}
10352 
10353 // Strict FP instruction biases argument before division then
10354 // biases result, to avoid double rounding of subnormals.
10355 //
10356 // scale dividend by multiplying dividend by 2^(-15360)
10357 // load divisor
10358 // divide scaled dividend by divisor
10359 // rescale quotient by 2^(15360)
10360 //
// Strict-FP double divide (x87, UseSSE<=1): bias dividend, divide, then
// rescale the quotient (see comment above) so subnormal results are
// rounded only once.
instruct strictfp_divD_reg(regDPR1 dst, regnotDPR1 src) %{
  // A single predicate, mirroring strictfp_mulD_reg.  The previous weaker
  // duplicate predicate (UseSSE<=1 alone, declared before match) has been
  // removed — a rule takes only one predicate and the strict-FP condition
  // is the intended one.
  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  match(Set dst (DivD dst src));
  ins_cost(1);   // Select this instruction for all strict FP double divides

  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    $src\n\t"
            "FDIVp  $dst,ST\n\t"
            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
            "DMULp  $dst,ST\n\t" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( strictfp_bias1(dst),
              Push_Reg_D(src),
              OpcP, RegOpc(dst),
              strictfp_bias2(dst) );
  ins_pipe( fpu_reg_reg );
%}
10380 
// Divide with an explicit store-round to a stack slot; only used for
// non-strict methods (strict divides take the biased rule above).
instruct divD_reg_round(stackSlotD dst, regD src1, regD src2) %{
  predicate( UseSSE<=1 && !(Compile::current()->has_method() && Compile::current()->method()->is_strict()) );
  match(Set dst (RoundDouble (DivD src1 src2)));

  format %{ "FLD    $src1\n\t"
            "FDIV   ST,$src2\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x6); /* D8 F0+i or D8 /6 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2), Pop_Mem_D(dst) );  // FSTP rounds to 64-bit
  ins_pipe( fpu_mem_reg_reg );
%}
10393 
10394 
// Double remainder (drem semantics) on the x87 path via FPREM loop.
instruct modD_reg(regD dst, regD src, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (ModD dst src));
  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS

  format %{ "DMOD   $dst,$src" %}
  ins_cost(250);
  ins_encode(Push_Reg_Mod_D(dst, src),
              emitModD(),              // FPREM loop until reduction complete
              Push_Result_Mod_D(src),
              Pop_Reg_D(dst));
  ins_pipe( pipe_slow );
%}

// XMM variant: spills both operands through the stack so the x87 FPREM
// loop can compute the remainder, then reloads the result into an XMM reg.
instruct modXD_reg(regXD dst, regXD src0, regXD src1, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE>=2);
  match(Set dst (ModD src0 src1));
  effect(KILL rax, KILL cr);

  format %{ "SUB    ESP,8\t # DMOD\n"
          "\tMOVSD  [ESP+0],$src1\n"
          "\tFLD_D  [ESP+0]\n"
          "\tMOVSD  [ESP+0],$src0\n"
          "\tFLD_D  [ESP+0]\n"
     "loop:\tFPREM\n"
          "\tFWAIT\n"
          "\tFNSTSW AX\n"
          "\tSAHF\n"
          "\tJP     loop\n"
          "\tFSTP_D [ESP+0]\n"
          "\tMOVSD  $dst,[ESP+0]\n"
          "\tADD    ESP,8\n"
          "\tFSTP   ST0\t # Restore FPU Stack"
    %}
  ins_cost(250);
  ins_encode( Push_ModD_encoding(src0, src1), emitModD(), Push_ResultXD(dst), PopFPU);
  ins_pipe( pipe_slow );
%}
10433 
// sin(x) in place on the x87 TOS register (D9 FE = FSIN).
instruct sinD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (SinD src));
  ins_cost(1800);
  format %{ "DSIN   $dst" %}
  opcode(0xD9, 0xFE);
  ins_encode( OpcP, OpcS );
  ins_pipe( pipe_slow );
%}

// XMM variant: bounce through the stack to run FSIN on the x87 unit.
instruct sinXD_reg(regXD dst, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (SinD dst));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  ins_cost(1800);
  format %{ "DSIN   $dst" %}
  opcode(0xD9, 0xFE);
  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}

// cos(x) in place on the x87 TOS register (D9 FF = FCOS).
instruct cosD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (CosD src));
  ins_cost(1800);
  format %{ "DCOS   $dst" %}
  opcode(0xD9, 0xFF);
  ins_encode( OpcP, OpcS );
  ins_pipe( pipe_slow );
%}

// XMM variant of cos via the x87 unit.
instruct cosXD_reg(regXD dst, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (CosD dst));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  ins_cost(1800);
  format %{ "DCOS   $dst" %}
  opcode(0xD9, 0xFF);
  ins_encode( Push_SrcXD(dst), OpcP, OpcS, Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}

// tan(x) on the x87 TOS.  FPTAN pushes a 1.0 on top of the result, which
// the trailing FSTP ST discards.
instruct tanD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst(TanD src));
  format %{ "DTAN   $dst" %}
  ins_encode( Opcode(0xD9), Opcode(0xF2),    // fptan
              Opcode(0xDD), Opcode(0xD8));   // fstp st
  ins_pipe( pipe_slow );
%}

// XMM variant of tan via the x87 unit.
instruct tanXD_reg(regXD dst, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst(TanD dst));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  format %{ "DTAN   $dst" %}
  ins_encode( Push_SrcXD(dst),
              Opcode(0xD9), Opcode(0xF2),    // fptan
              Opcode(0xDD), Opcode(0xD8),   // fstp st
              Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}

// atan2-style two-operand arctangent (D9 F3 = FPATAN).
instruct atanD_reg(regD dst, regD src) %{
  predicate (UseSSE<=1);
  match(Set dst(AtanD dst src));
  format %{ "DATA   $dst,$src" %}
  opcode(0xD9, 0xF3);
  ins_encode( Push_Reg_D(src),
              OpcP, OpcS, RegOpc(dst) );
  ins_pipe( pipe_slow );
%}

// XMM variant of the arctangent via the x87 unit.
instruct atanXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst(AtanD dst src));
  effect(KILL cr); // Push_{Src|Result}XD() uses "{SUB|ADD} ESP,8"
  format %{ "DATA   $dst,$src" %}
  opcode(0xD9, 0xF3);
  ins_encode( Push_SrcXD(src),
              OpcP, OpcS, Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}
10517 
// Double square root on the x87 path.  Note the opcode bytes are listed
// (primary,secondary) = (0xFA,0xD9) and emitted OpcS then OpcP, giving
// D9 FA = FSQRT.
instruct sqrtD_reg(regD dst, regD src) %{
  predicate (UseSSE<=1);
  match(Set dst (SqrtD src));
  format %{ "DSQRT  $dst,$src" %}
  opcode(0xFA, 0xD9);
  ins_encode( Push_Reg_D(src),
              OpcS, OpcP, Pop_Reg_D(dst) );
  ins_pipe( pipe_slow );
%}
10527 
// Fast-path pow(X,Y): computes Q = Y*log2(X) via FYL2X, then 2^Q via
// F2XM1 on frac(Q) with a hand-built scaling double for int(Q).
// Result is left in Y (FPR1); EAX/EBX/ECX are scratch.
instruct powD_reg(regD X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
  predicate (UseSSE<=1);
  match(Set Y (PowD X Y));  // Raise X to the Yth power
  effect(KILL rax, KILL rbx, KILL rcx);
  format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
            "FLD_D  $X\n\t"
            "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"

            "FDUP   \t\t\t# Q Q\n\t"
            "FRNDINT\t\t\t# int(Q) Q\n\t"
            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
            "FISTP  dword [ESP]\n\t"
            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
            "MOV    [ESP+0],0\n\t"
            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"

            "ADD    ESP,8"
             %}
  ins_encode( push_stack_temp_qword,
              Push_Reg_D(X),
              Opcode(0xD9), Opcode(0xF1),   // fyl2x
              pow_exp_core_encoding,        // shared 2^Q scaling sequence
              pop_stack_temp_qword);
  ins_pipe( pipe_slow );
%}
10563 
// XMM fast-path pow: marshals src0/src1 through the stack onto the x87
// unit, runs the same FYL2X + 2^Q core as powD_reg, then moves the
// result back into an XMM register.
instruct powXD_reg(regXD dst, regXD src0, regXD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
  predicate (UseSSE>=2);
  match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
  effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx );
  format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
            "MOVSD  [ESP],$src1\n\t"
            "FLD    FPR1,$src1\n\t"
            "MOVSD  [ESP],$src0\n\t"
            "FLD    FPR1,$src0\n\t"
            "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"

            "FDUP   \t\t\t# Q Q\n\t"
            "FRNDINT\t\t\t# int(Q) Q\n\t"
            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
            "FISTP  dword [ESP]\n\t"
            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
            "MOV    [ESP+0],0\n\t"
            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"

            "FST_D  [ESP]\n\t"
            "MOVSD  $dst,[ESP]\n\t"
            "ADD    ESP,8"
             %}
  ins_encode( push_stack_temp_qword,
              push_xmm_to_fpr1(src1),
              push_xmm_to_fpr1(src0),
              Opcode(0xD9), Opcode(0xF1),   // fyl2x
              pow_exp_core_encoding,        // shared 2^Q scaling sequence
              Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}
10605 
10606 
// Fast-path exp(x) on the x87 unit: Q = x*log2(e) via FLDL2E+FMULP, then
// 2^Q via the shared F2XM1/scaling core.  Operates in place on FPR1;
// EAX/EBX/ECX are scratch.
instruct expD_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
  predicate (UseSSE<=1);
  match(Set dpr1 (ExpD dpr1));
  effect(KILL rax, KILL rbx, KILL rcx);
  // First format line now ends with "\n\t" so the disassembly comment does
  // not run into the FLDL2E line (matches expXD_reg below).
  format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding\n\t"
            "FLDL2E \t\t\t# Ld log2(e) X\n\t"
            "FMULP  \t\t\t# Q=X*log2(e)\n\t"

            "FDUP   \t\t\t# Q Q\n\t"
            "FRNDINT\t\t\t# int(Q) Q\n\t"
            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
            "FISTP  dword [ESP]\n\t"
            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
            "MOV    [ESP+0],0\n\t"
            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"

            "ADD    ESP,8"
             %}
  ins_encode( push_stack_temp_qword,
              Opcode(0xD9), Opcode(0xEA),   // fldl2e
              Opcode(0xDE), Opcode(0xC9),   // fmulp
              pow_exp_core_encoding,
              pop_stack_temp_qword);
  ins_pipe( pipe_slow );
%}
10642 
// XMM fast-path exp: spills src to the stack for the x87 unit, runs the
// FLDL2E + 2^Q core, and reloads the result into an XMM register.
instruct expXD_reg(regXD dst, regXD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
  predicate (UseSSE>=2);
  match(Set dst (ExpD src));
  effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx);
  format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding\n\t"
            "MOVSD  [ESP],$src\n\t"
            "FLDL2E \t\t\t# Ld log2(e) X\n\t"
            "FMULP  \t\t\t# Q=X*log2(e) X\n\t"

            "FDUP   \t\t\t# Q Q\n\t"
            "FRNDINT\t\t\t# int(Q) Q\n\t"
            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
            "FISTP  dword [ESP]\n\t"
            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
            "MOV    [ESP+0],0\n\t"
            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"

            "FST_D  [ESP]\n\t"
            "MOVSD  $dst,[ESP]\n\t"
            "ADD    ESP,8"
             %}
  ins_encode( Push_SrcXD(src),
              Opcode(0xD9), Opcode(0xEA),   // fldl2e
              Opcode(0xDE), Opcode(0xC9),   // fmulp
              pow_exp_core_encoding,
              Push_ResultXD(dst) );
  ins_pipe( pipe_slow );
%}
10681 
10682 
10683 
// log10(x) on the x87 TOS: FYL2X computes ST(1)*log2(ST(0)), so pushing
// log10(2) and swapping yields log10(2)*log2(x) = log10(x).
instruct log10D_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  // The source Double operand on FPU stack
  match(Set dst (Log10D src));
  // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
  // fxch         ; swap ST(0) with ST(1)
  // fyl2x        ; compute log_10(2) * log_2(x)
  format %{ "FLDLG2 \t\t\t#Log10\n\t"
            "FXCH   \n\t"
            "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
              Opcode(0xD9), Opcode(0xC9),   // fxch
              Opcode(0xD9), Opcode(0xF1));  // fyl2x

  ins_pipe( pipe_slow );
%}

// XMM variant: the constant is pushed first, then the source, so no FXCH
// is needed before FYL2X.
instruct log10XD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  effect(KILL cr);
  match(Set dst (Log10D src));
  // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
  // fyl2x        ; compute log_10(2) * log_2(x)
  format %{ "FLDLG2 \t\t\t#Log10\n\t"
            "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
              Push_SrcXD(src),
              Opcode(0xD9), Opcode(0xF1),   // fyl2x
              Push_ResultXD(dst));

  ins_pipe( pipe_slow );
%}

// Natural log on the x87 TOS: ln(x) = ln(2)*log2(x) via FLDLN2 + FYL2X.
instruct logD_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  // The source Double operand on FPU stack
  match(Set dst (LogD src));
  // fldln2       ; push log_e(2) on the FPU stack; full 80-bit number
  // fxch         ; swap ST(0) with ST(1)
  // fyl2x        ; compute log_e(2) * log_2(x)
  format %{ "FLDLN2 \t\t\t#Log_e\n\t"
            "FXCH   \n\t"
            "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
              Opcode(0xD9), Opcode(0xC9),   // fxch
              Opcode(0xD9), Opcode(0xF1));  // fyl2x

  ins_pipe( pipe_slow );
%}

// XMM variant of natural log.
instruct logXD_reg(regXD dst, regXD src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  effect(KILL cr);
  // The source and result Double operands in XMM registers
  match(Set dst (LogD src));
  // fldln2       ; push log_e(2) on the FPU stack; full 80-bit number
  // fyl2x        ; compute log_e(2) * log_2(x)
  format %{ "FLDLN2 \t\t\t#Log_e\n\t"
            "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
              Push_SrcXD(src),
              Opcode(0xD9), Opcode(0xF1),   // fyl2x
              Push_ResultXD(dst));
  ins_pipe( pipe_slow );
%}
10753 
10754 //-------------Float Instructions-------------------------------
10755 // Float Math
10756 
10757 // Code for float compare:
10758 //     fcompp();
10759 //     fwait(); fnstsw_ax();
10760 //     sahf();
10761 //     movl(dst, unordered_result);
10762 //     jcc(Assembler::parity, exit);
10763 //     movl(dst, less_result);
10764 //     jcc(Assembler::below, exit);
10765 //     movl(dst, equal_result);
10766 //     jcc(Assembler::equal, exit);
10767 //     movl(dst, greater_result);
10768 //   exit:
10769 
// P6 version of float compare, sets condition codes in EFLAGS
// NOTE(review): these float rules reuse the double push/opcode helpers
// (Push_Reg_D) — apparently intentional since x87 registers hold 80-bit
// values regardless of operand width; confirm against the encode blocks.
instruct cmpF_cc_P6(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
  predicate(VM_Version::supports_cmov() && UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  effect(KILL rax);
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction\n\t"
            "JNP    exit\n\t"
            "MOV    ah,1       // saw a NaN, set CF (treat as LT)\n\t"
            "SAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              cmpF_P6_fixup );        // NaN -> carry set, unordered as LT
  ins_pipe( pipe_slow );
%}

// As above, but for a CF-only flags user: no NaN fixup is needed.
instruct cmpF_cc_P6CF(eFlagsRegUCF cr, regF src1, regF src2) %{
  predicate(VM_Version::supports_cmov() && UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  ins_cost(100);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2));
  ins_pipe( pipe_slow );
%}


// Compare & branch
// Pre-P6 path: FCOMP + FNSTSW/SAHF to move the FPU flags into EFLAGS.
instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2, eAXRegI rax) %{
  predicate(UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  effect(KILL rax);
  ins_cost(200);
  format %{ "FLD    $src1\n\t"
            "FCOMp  $src2\n\t"
            "FNSTSW AX\n\t"
            "TEST   AX,0x400\n\t"
            "JZ,s   flags\n\t"
            "MOV    AH,1\t# unordered treat as LT\n"
    "flags:\tSAHF" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              fpu_flags);
  ins_pipe( pipe_slow );
%}

// Compare vs zero into -1,0,1
// Uses FTST (D9 E4) against +0.0, then materializes -1/0/1 in dst.
instruct cmpF_0(eRegI dst, regF src1, immF0 zero, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE == 0);
  match(Set dst (CmpF3 src1 zero));
  effect(KILL cr, KILL rax);
  ins_cost(280);
  format %{ "FTSTF  $dst,$src1" %}
  opcode(0xE4, 0xD9);
  ins_encode( Push_Reg_D(src1),
              OpcS, OpcP, PopFPU,
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1
instruct cmpF_reg(eRegI dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE == 0);
  match(Set dst (CmpF3 src1 src2));
  effect(KILL cr, KILL rax);
  ins_cost(300);
  format %{ "FCMPF  $dst,$src1,$src2" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_D(src1),
              OpcP, RegOpc(src2),
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}
10849 
// float compare and set condition codes in EFLAGS by XMM regs
// COMISS (0F 2F) sets ZF/PF/CF directly; the fixup handles NaN (PF set).
instruct cmpX_cc(eFlagsRegU cr, regX dst, regX src, eAXRegI rax) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF dst src));
  effect(KILL rax);
  ins_cost(145);
  format %{ "COMISS $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegReg(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// CF-only flags user: plain COMISS, no NaN fixup required.
instruct cmpX_ccCF(eFlagsRegUCF cr, regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF dst src));
  ins_cost(100);
  format %{ "COMISS $dst,$src" %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
// Memory-operand variant of cmpX_cc.
instruct cmpX_ccmem(eFlagsRegU cr, regX dst, memory src, eAXRegI rax) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF dst (LoadF src)));
  effect(KILL rax);
  ins_cost(165);
  format %{ "COMISS $dst,$src\n"
          "\tJNP    exit\n"
          "\tMOV    ah,1       // saw a NaN, set CF\n"
          "\tSAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegMem(dst, src), cmpF_P6_fixup);
  ins_pipe( pipe_slow );
%}

// Memory-operand, CF-only variant.
instruct cmpX_ccmemCF(eFlagsRegUCF cr, regX dst, memory src) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF dst (LoadF src)));
  ins_cost(100);
  format %{ "COMISS $dst,$src" %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegMem(dst, src));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM
// XOR zeroes dst before COMISS (XOR would clobber the flags afterwards).
instruct cmpX_reg(eRegI dst, regX src1, regX src2, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (CmpF3 src1 src2));
  effect(KILL cr);
  ins_cost(255);
  format %{ "XOR    $dst,$dst\n"
          "\tCOMISS $src1,$src2\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x0F, 0x2F);
  ins_encode(Xor_Reg(dst), OpcP, OpcS, RegReg(src1, src2), CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM and memory
// Here dst is zeroed with MOV dst,0 *after* COMISS so the flags survive.
instruct cmpX_regmem(eRegI dst, regX src1, memory mem, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (CmpF3 src1 (LoadF mem)));
  effect(KILL cr);
  ins_cost(275);
  format %{ "COMISS $src1,$mem\n"
          "\tMOV    $dst,0\t\t# do not blow flags\n"
          "\tJP,s   nan\n"
          "\tJEQ,s  exit\n"
          "\tJA,s   inc\n"
      "nan:\tDEC    $dst\n"
          "\tJMP,s  exit\n"
      "inc:\tINC    $dst\n"
      "exit:"
                %}
  opcode(0x0F, 0x2F);
  ins_encode(OpcP, OpcS, RegMem(src1, mem), LdImmI(dst,0x0), CmpX_Result(dst));
  ins_pipe( pipe_slow );
%}
10943 
// Spill to obtain 24-bit precision
// Storing to a float stack slot rounds the 80-bit x87 result to 24 bits.
instruct subF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (SubF src1 src2));

  format %{ "FSUB   $dst,$src1 - $src2" %}
  opcode(0xD8, 0x4); /* D8 E0+i or D8 /4 mod==0x3 ;; result in TOS */
  ins_encode( Push_Reg_F(src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );     // store to stack slot rounds to 24-bit
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct subF_reg(regF dst, regF src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (SubF dst src));

  format %{ "FSUB   $dst,$src" %}
  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
  ins_encode( Push_Reg_F(src),
              OpcP, RegOpc(dst) );  // FSUBP form, result stays in dst
  ins_pipe( fpu_reg_reg );
%}

// Spill to obtain 24-bit precision
instruct addF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0); /* D8 C0+i */
  ins_encode( Push_Reg_F(src2),
              OpcReg_F(src1),
              Pop_Mem_F(dst) );     // store to stack slot rounds to 24-bit
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct addF_reg(regF dst, regF src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF dst src));

  format %{ "FLD    $src\n\t"
            "FADDp  $dst,ST" %}
  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
  ins_encode( Push_Reg_F(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}
10994 
// Add two single precision floating point values in xmm
// Scalar-single SSE arithmetic: all rules below emit an F3 0F <op>
// (ADDSS/SUBSS/MULSS/DIVSS) with reg, constant-area, or memory operand.
instruct addX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (AddF dst src));
  format %{ "ADDSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// ADDSS with an immediate materialized in the constant area.
instruct addX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (AddF dst con));
  format %{ "ADDSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

// ADDSS directly from memory (cisc form).
instruct addX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (AddF dst (LoadF mem)));
  format %{ "ADDSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x58), RegMem(dst, mem));
  ins_pipe( pipe_slow );
%}

// Subtract two single precision floating point values in xmm
instruct subX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (SubF dst src));
  format %{ "SUBSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct subX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (SubF dst con));
  format %{ "SUBSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct subX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (SubF dst (LoadF mem)));
  format %{ "SUBSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5C), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Multiply two single precision floating point values in xmm
instruct mulX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (MulF dst src));
  format %{ "MULSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct mulX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (MulF dst con));
  format %{ "MULSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct mulX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (MulF dst (LoadF mem)));
  format %{ "MULSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x59), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}

// Divide two single precision floating point values in xmm
instruct divX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (DivF dst src));
  format %{ "DIVSS  $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct divX_imm(regX dst, immXF con) %{
  predicate(UseSSE>=1);
  match(Set dst (DivF dst con));
  format %{ "DIVSS  $dst,[$con]" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), LdImmX(dst, con) );
  ins_pipe( pipe_slow );
%}

instruct divX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (DivF dst (LoadF mem)));
  format %{ "DIVSS  $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x5E), RegMem(dst,mem));
  ins_pipe( pipe_slow );
%}
11094 
// Get the square root of a single precision floating point values in xmm
// Matches the D2F(SqrtD(F2D x)) pattern so a float sqrt can be done with
// a single SQRTSS (correct since sqrt is exactly representable this way).
instruct sqrtX_reg(regX dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
  format %{ "SQRTSS $dst,$src" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct sqrtX_mem(regX dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF mem)))));
  format %{ "SQRTSS $dst,$mem" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
  ins_pipe( pipe_slow );
%}

// Get the square root of a double precision floating point values in xmm
instruct sqrtXD_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (SqrtD src));
  format %{ "SQRTSD $dst,$src" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

instruct sqrtXD_mem(regXD dst, memory mem) %{
  predicate(UseSSE>=2);
  match(Set dst (SqrtD (LoadD mem)));
  format %{ "SQRTSD $dst,$mem" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x51), RegMem(dst, mem));
  ins_pipe( pipe_slow );
%}
11128 
// Absolute value of a float on the x87 stack top (FPR1); no SSE.
instruct absF_reg(regFPR1 dst, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set dst (AbsF src));
  ins_cost(100);
  format %{ "FABS" %}
  opcode(0xE1, 0xD9);               // D9 E1: FABS (OpcS emits 0xD9, OpcP emits 0xE1)
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

// Absolute value of a float in xmm: clear the sign bit by ANDing
// with the 0x7FFFFFFF mask.
instruct absX_reg(regX dst ) %{
  predicate(UseSSE>=1);
  match(Set dst (AbsF dst));
  format %{ "ANDPS  $dst,[0x7FFFFFFF]\t# ABS F by sign masking" %}
  ins_encode( AbsXF_encoding(dst));
  ins_pipe( pipe_slow );
%}

// Negate a float on the x87 stack top (FPR1); no SSE.
instruct negF_reg(regFPR1 dst, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set dst (NegF src));
  ins_cost(100);
  format %{ "FCHS" %}
  opcode(0xE0, 0xD9);               // D9 E0: FCHS
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

// Negate a float in xmm: flip the sign bit by XORing with 0x80000000.
instruct negX_reg( regX dst ) %{
  predicate(UseSSE>=1);
  match(Set dst (NegF dst));
  format %{ "XORPS  $dst,[0x80000000]\t# CHS F by sign flipping" %}
  ins_encode( NegXF_encoding(dst));
  ins_pipe( pipe_slow );
%}
11164 
// Cisc-alternate to addF_reg
// Spill to obtain 24-bit precision
// (UseSSE==0 strict-fp mode: the FPU computes in 80-bit; storing the
// result back to a 32-bit stack slot forces rounding to float precision.)
instruct addF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "FLD    $src2\n\t"
            "FADD   ST,$src1\n\t"
            "FSTP_S $dst" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_F(src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}
//
// Cisc-alternate to addF_reg
// This instruction does not round to 24-bits
instruct addF_reg_mem(regF dst, memory src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF dst (LoadF src)));

  format %{ "FADD   $dst,$src" %}
  opcode(0xDE, 0x0, 0xD9); /* DE C0+i or DE /0*/  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Following two instructions are for _222_mpegaudio
// Spill to obtain 24-bit precision
// (memory operand on the left of the add; addition is commutative so the
// encoding loads src1 from memory and adds the register src2.)
instruct addF24_mem_reg(stackSlotF dst, regF src2, memory src1 ) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}

// Cisc-spill variant
// Spill to obtain 24-bit precision
// (both operands come from memory; src2's load is subsumed into the add)
instruct addF24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "FADD   $dst,$src1,$src2 cisc" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}

// Spill to obtain 24-bit precision
// (mem + mem form, result spilled to a float stack slot)
instruct addF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 /0 */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}


// Spill to obtain 24-bit precision
// (register + float-constant form; the constant is a memory operand)
instruct addF24_reg_imm(stackSlotF dst, regF src1, immF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));
  format %{ "FLD    $src1\n\t"
            "FADD   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x00);       /* D8 /0 */
  ins_encode( Push_Reg_F(src1),
              Opc_MemImm_F(src2),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_reg_con );
%}
//
// This instruction does not round to 24-bits
instruct addF_reg_imm(regF dst, regF src1, immF src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));
  format %{ "FLD    $src1\n\t"
            "FADD   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x00);       /* D8 /0 */
  ins_encode( Push_Reg_F(src1),
              Opc_MemImm_F(src2),
              Pop_Reg_F(dst));
  ins_pipe( fpu_reg_reg_con );
%}
11265 
// Spill to obtain 24-bit precision
// (x87 float multiply; the store to a 32-bit stack slot forces the
// 80-bit intermediate to be rounded to single precision)
instruct mulF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FLD    $src1\n\t"
            "FMUL   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1); /* D8 C8+i or D8 /1 ;; result in TOS */
  ins_encode( Push_Reg_F(src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct mulF_reg(regF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FLD    $src1\n\t"
            "FMUL   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1); /* D8 C8+i */
  ins_encode( Push_Reg_F(src2),
              OpcReg_F(src1),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_reg_reg );
%}


// Spill to obtain 24-bit precision
// Cisc-alternate to reg-reg multiply
instruct mulF24_reg_mem(stackSlotF dst, regF src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "FLD_S  $src2\n\t"
            "FMUL   $src1\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or DE /1*/  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_F(src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}
//
// This instruction does not round to 24-bits
// Cisc-alternate to reg-reg multiply
instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "FMUL   $dst,$src1,$src2" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_F(src1),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}

// Spill to obtain 24-bit precision
// (both multiply operands come from memory)
instruct mulF24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FMUL   $dst,$src1,$src2" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 /1 */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}

// Spill to obtain 24-bit precision
// (register times float constant; the constant is a memory operand)
instruct mulF24_reg_imm(stackSlotF dst, regF src1, immF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FMULc $dst,$src1,$src2" %}
  opcode(0xD8, 0x1);  /* D8 /1*/
  ins_encode( Push_Reg_F(src1),
              Opc_MemImm_F(src2),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_reg_con );
%}
11353 //
11354 // This instruction does not round to 24-bits
11355 instruct mulF_reg_imm(regF dst, regF src1, immF src2) %{
11356   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
11357   match(Set dst (MulF src1 src2));
11358 
11359   format %{ "FMULc $dst. $src1, $src2" %}
11360   opcode(0xD8, 0x1);  /* D8 /1*/
11361   ins_encode( Push_Reg_F(src1),
11362               Opc_MemImm_F(src2),
11363               Pop_Reg_F(dst));
11364   ins_pipe( fpu_reg_reg_con );
11365 %}
11366 
11367 
11368 //
11369 // MACRO1 -- subsume unshared load into mulF
11370 // This instruction does not round to 24-bits
11371 instruct mulF_reg_load1(regF dst, regF src, memory mem1 ) %{
11372   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
11373   match(Set dst (MulF (LoadF mem1) src));
11374 
11375   format %{ "FLD    $mem1    ===MACRO1===\n\t"
11376             "FMUL   ST,$src\n\t"
11377             "FSTP   $dst" %}
11378   opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or D8 /1 */  /* LoadF D9 /0 */
11379   ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem1),
11380               OpcReg_F(src),
11381               Pop_Reg_F(dst) );
11382   ins_pipe( fpu_reg_reg_mem );
11383 %}
11384 //
11385 // MACRO2 -- addF a mulF which subsumed an unshared load
11386 // This instruction does not round to 24-bits
11387 instruct addF_mulF_reg_load1(regF dst, memory mem1, regF src1, regF src2) %{
11388   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
11389   match(Set dst (AddF (MulF (LoadF mem1) src1) src2));
11390   ins_cost(95);
11391 
11392   format %{ "FLD    $mem1     ===MACRO2===\n\t"
11393             "FMUL   ST,$src1  subsume mulF left load\n\t"
11394             "FADD   ST,$src2\n\t"
11395             "FSTP   $dst" %}
11396   opcode(0xD9); /* LoadF D9 /0 */
11397   ins_encode( OpcP, RMopc_Mem(0x00,mem1),
11398               FMul_ST_reg(src1),
11399               FAdd_ST_reg(src2),
11400               Pop_Reg_F(dst) );
11401   ins_pipe( fpu_reg_mem_reg_reg );
11402 %}
11403 
11404 // MACRO3 -- addF a mulF
11405 // This instruction does not round to 24-bits.  It is a '2-address'
11406 // instruction in that the result goes back to src2.  This eliminates
11407 // a move from the macro; possibly the register allocator will have
11408 // to add it back (and maybe not).
11409 instruct addF_mulF_reg(regF src2, regF src1, regF src0) %{
11410   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
11411   match(Set src2 (AddF (MulF src0 src1) src2));
11412 
11413   format %{ "FLD    $src0     ===MACRO3===\n\t"
11414             "FMUL   ST,$src1\n\t"
11415             "FADDP  $src2,ST" %}
11416   opcode(0xD9); /* LoadF D9 /0 */
11417   ins_encode( Push_Reg_F(src0),
11418               FMul_ST_reg(src1),
11419               FAddP_reg_ST(src2) );
11420   ins_pipe( fpu_reg_reg_reg );
11421 %}
11422 
11423 // MACRO4 -- divF subF
11424 // This instruction does not round to 24-bits
11425 instruct subF_divF_reg(regF dst, regF src1, regF src2, regF src3) %{
11426   predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
11427   match(Set dst (DivF (SubF src2 src1) src3));
11428 
11429   format %{ "FLD    $src2   ===MACRO4===\n\t"
11430             "FSUB   ST,$src1\n\t"
11431             "FDIV   ST,$src3\n\t"
11432             "FSTP  $dst" %}
11433   opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
11434   ins_encode( Push_Reg_F(src2),
11435               subF_divF_encode(src1,src3),
11436               Pop_Reg_F(dst) );
11437   ins_pipe( fpu_reg_reg_reg_reg );
11438 %}
11439 
// Spill to obtain 24-bit precision
// (x87 float divide; the 32-bit store rounds the result to float precision)
instruct divF24_reg(stackSlotF dst, regF src1, regF src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (DivF src1 src2));

  format %{ "FDIV   $dst,$src1,$src2" %}
  opcode(0xD8, 0x6); /* D8 F0+i or DE /6*/
  ins_encode( Push_Reg_F(src1),
              OpcReg_F(src2),
              Pop_Mem_F(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct divF_reg(regF dst, regF src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (DivF dst src));

  format %{ "FDIV   $dst,$src" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( Push_Reg_F(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}
11464 
11465 
// Spill to obtain 24-bit precision
// Float remainder via the shared emitModD() helper (FPREM loop).
instruct modF24_reg(stackSlotF dst, regF src1, regF src2, eAXRegI rax, eFlagsReg cr) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ModF src1 src2));
  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS
)
  format %{ "FMOD   $dst,$src1,$src2" %}
  ins_encode( Push_Reg_Mod_D(src1, src2),
              emitModD(),
              Push_Result_Mod_D(src2),
              Pop_Mem_F(dst));
  ins_pipe( pipe_slow );
%}
//
// This instruction does not round to 24-bits
instruct modF_reg(regF dst, regF src, eAXRegI rax, eFlagsReg cr) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ModF dst src));
  effect(KILL rax, KILL cr); // emitModD() uses EAX and EFLAGS

  format %{ "FMOD   $dst,$src" %}
  ins_encode(Push_Reg_Mod_D(dst, src),
              emitModD(),
              Push_Result_Mod_D(src),
              Pop_Reg_F(dst));
  ins_pipe( pipe_slow );
%}

// Float remainder for xmm operands: SSE has no remainder instruction,
// so the operands are bounced through the stack to the x87 FPREM loop
// and the result is moved back into xmm.
instruct modX_reg(regX dst, regX src0, regX src1, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (ModF src0 src1));
  effect(KILL rax, KILL cr);
  format %{ "SUB    ESP,4\t # FMOD\n"
          "\tMOVSS  [ESP+0],$src1\n"
          "\tFLD_S  [ESP+0]\n"
          "\tMOVSS  [ESP+0],$src0\n"
          "\tFLD_S  [ESP+0]\n"
     "loop:\tFPREM\n"
          "\tFWAIT\n"
          "\tFNSTSW AX\n"
          "\tSAHF\n"
          "\tJP     loop\n"
          "\tFSTP_S [ESP+0]\n"
          "\tMOVSS  $dst,[ESP+0]\n"
          "\tADD    ESP,4\n"
          "\tFSTP   ST0\t # Restore FPU Stack"
    %}
  ins_cost(250);
  ins_encode( Push_ModX_encoding(src0, src1), emitModD(), Push_ResultX(dst,0x4), PopFPU);
  ins_pipe( pipe_slow );
%}
11517 
11518 
//----------Arithmetic Conversion Instructions---------------------------------
// The conversion operations are all alpha-sorted.  Please keep it that way!

// Round an x87 float register by storing it to a 32-bit stack slot.
instruct roundFloat_mem_reg(stackSlotF dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (RoundFloat src));
  ins_cost(125);
  format %{ "FST_S  $dst,$src\t# F-round" %}
  ins_encode( Pop_Mem_Reg_F(dst, src) );
  ins_pipe( fpu_mem_reg );
%}

// Round an x87 double register by storing it to a 64-bit stack slot.
instruct roundDouble_mem_reg(stackSlotD dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (RoundDouble src));
  ins_cost(125);
  format %{ "FST_D  $dst,$src\t# D-round" %}
  ins_encode( Pop_Mem_Reg_D(dst, src) );
  ins_pipe( fpu_mem_reg );
%}
11539 
// Force rounding to 24-bit precision and 6-bit exponent
// (D2F on the pure x87 path expands into a rounding store)
instruct convD2F_reg(stackSlotF dst, regD src) %{
  predicate(UseSSE==0);
  match(Set dst (ConvD2F src));
  format %{ "FST_S  $dst,$src\t# F-round" %}
  expand %{
    roundFloat_mem_reg(dst,src);
  %}
%}

// Force rounding to 24-bit precision and 6-bit exponent
// (UseSSE==1: double lives in x87, result must land in xmm; round by
// storing single precision to the stack and reloading with MOVSS)
instruct convD2X_reg(regX dst, regD src, eFlagsReg cr) %{
  predicate(UseSSE==1);
  match(Set dst (ConvD2F src));
  effect( KILL cr );
  format %{ "SUB    ESP,4\n\t"
            "FST_S  [ESP],$src\t# F-round\n\t"
            "MOVSS  $dst,[ESP]\n\t"
            "ADD ESP,4" %}
  ins_encode( D2X_encoding(dst, src) );
  ins_pipe( pipe_slow );
%}

// Force rounding double precision to single precision
// (SSE2: single CVTSD2SS instruction)
instruct convXD2X_reg(regX dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (ConvD2F src));
  format %{ "CVTSD2SS $dst,$src\t# F-round" %}
  opcode(0xF2, 0x0F, 0x5A);               // F2 0F 5A: CVTSD2SS
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// Widen float to double on the x87 stack (no SSE).
instruct convF2D_reg_reg(regD dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2D src));
  format %{ "FST_S  $dst,$src\t# D-round" %}
  ins_encode( Pop_Reg_Reg_D(dst, src));
  ins_pipe( fpu_reg_reg );
%}

// Widen float to double, result to a 64-bit stack slot (UseSSE==1).
instruct convF2D_reg(stackSlotD dst, regF src) %{
  predicate(UseSSE==1);
  match(Set dst (ConvF2D src));
  format %{ "FST_D  $dst,$src\t# D-round" %}
  expand %{
    roundDouble_mem_reg(dst,src);
  %}
%}
11589 
// Widen a float in xmm to a double in an x87 register (UseSSE==1):
// spill the float to the stack, FLD_S it, and pop into the D register.
// Fix: the MOVSS line of the debug format string was missing the comma
// between the memory operand and $src ("[ESP] $src").
instruct convX2D_reg(regD dst, regX src, eFlagsReg cr) %{
  predicate(UseSSE==1);
  match(Set dst (ConvF2D src));
  effect( KILL cr );
  format %{ "SUB    ESP,4\n\t"
            "MOVSS  [ESP],$src\n\t"
            "FLD_S  [ESP]\n\t"
            "ADD    ESP,4\n\t"
            "FSTP   $dst\t# D-round" %}
  ins_encode( X2D_encoding(dst, src), Pop_Reg_D(dst));
  ins_pipe( pipe_slow );
%}
11602 
// Widen a float in xmm to a double in xmm (SSE2: CVTSS2SD).
instruct convX2XD_reg(regXD dst, regX src) %{
  predicate(UseSSE>=2);
  match(Set dst (ConvF2D src));
  format %{ "CVTSS2SD $dst,$src\t# D-round" %}
  opcode(0xF3, 0x0F, 0x5A);               // F3 0F 5A: CVTSS2SD
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}
11611 
// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
// (x87 path: truncate via FISTP under a switched rounding mode; the sentinel
// 0x80000000 result indicates overflow/NaN and routes to the d2i_wrapper
// slow path for exact Java semantics.)
instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
  predicate(UseSSE<=1);
  match(Set dst (ConvD2I src));
  effect( KILL tmp, KILL cr );
  format %{ "FLD    $src\t# Convert double to int \n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,4\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "CMP    EAX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "FLD_D  $src\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  ins_encode( Push_Reg_D(src), D2I_encoding(src) );
  ins_pipe( pipe_slow );
%}

// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
// (SSE2 path: CVTTSD2SI truncates directly; the 0x80000000 sentinel again
// selects the d2i_wrapper slow path for overflow/NaN.)
instruct convXD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regXD src, eFlagsReg cr ) %{
  predicate(UseSSE>=2);
  match(Set dst (ConvD2I src));
  effect( KILL tmp, KILL cr );
  format %{ "CVTTSD2SI $dst, $src\n\t"
            "CMP    $dst,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP, 8\n\t"
            "MOVSD  [ESP], $src\n\t"
            "FLD_D  [ESP]\n\t"
            "ADD    ESP, 8\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  opcode(0x1); // double-precision conversion
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));  // F2 0F 2C: CVTTSD2SI
  ins_pipe( pipe_slow );
%}

// Convert a double to a long (x87 path); result in EDX:EAX.
// Sentinel 0x8000000000000000 routes to d2l_wrapper for overflow/NaN.
instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
  predicate(UseSSE<=1);
  match(Set dst (ConvD2L src));
  effect( KILL cr );
  format %{ "FLD    $src\t# Convert double to long\n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,8\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode( Push_Reg_D(src),  D2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

// XMM lacks a float/double->long conversion, so use the old FPU stack.
instruct convXD2L_reg_reg( eADXRegL dst, regXD src, eFlagsReg cr ) %{
  predicate (UseSSE>=2);
  match(Set dst (ConvD2L src));
  effect( KILL cr );
  format %{ "SUB    ESP,8\t# Convert double to long\n\t"
            "MOVSD  [ESP],$src\n\t"
            "FLD_D  [ESP]\n\t"
            "FLDCW  trunc mode\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP,8\n\t"
            "MOVSD  [ESP],$src\n\t"
            "FLD_D  [ESP]\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode( XD2L_encoding(src) );
  ins_pipe( pipe_slow );
%}
11698 
// Convert a double to an int.  Java semantics require we do complex
// manglations in the corner cases.  So we set the rounding mode to
// 'zero', store the darned double down as an int, and reset the
// rounding mode to 'nearest'.  The hardware stores a flag value down
// if we would overflow or converted a NAN; we check for this and
// and go the slow path if needed.
instruct convF2I_reg_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2I src));
  effect( KILL tmp, KILL cr );
  format %{ "FLD    $src\t# Convert float to int \n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,4\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "CMP    EAX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  // D2I_encoding works for F2I
  ins_encode( Push_Reg_F(src), D2I_encoding(src) );
  ins_pipe( pipe_slow );
%}

// Convert a float in xmm to an int reg.
// (CVTTSS2SI truncates; 0x80000000 sentinel selects the d2i_wrapper
// slow path for overflow/NaN, matching the double variant above.)
instruct convX2I_reg(eAXRegI dst, eDXRegI tmp, regX src, eFlagsReg cr ) %{
  predicate(UseSSE>=1);
  match(Set dst (ConvF2I src));
  effect( KILL tmp, KILL cr );
  format %{ "CVTTSS2SI $dst, $src\n\t"
            "CMP    $dst,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP, 4\n\t"
            "MOVSS  [ESP], $src\n\t"
            "FLD    [ESP]\n\t"
            "ADD    ESP, 4\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  opcode(0x0); // single-precision conversion
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x2C), FX2I_encoding(src,dst));  // F3 0F 2C: CVTTSS2SI
  ins_pipe( pipe_slow );
%}

// Convert a float to a long (x87 path); result in EDX:EAX.
instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2L src));
  effect( KILL cr );
  format %{ "FLD    $src\t# Convert float to long\n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,8\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  // D2L_encoding works for F2L
  ins_encode( Push_Reg_F(src), D2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

// XMM lacks a float/double->long conversion, so use the old FPU stack.
instruct convX2L_reg_reg( eADXRegL dst, regX src, eFlagsReg cr ) %{
  predicate (UseSSE>=1);
  match(Set dst (ConvF2L src));
  effect( KILL cr );
  format %{ "SUB    ESP,8\t# Convert float to long\n\t"
            "MOVSS  [ESP],$src\n\t"
            "FLD_S  [ESP]\n\t"
            "FLDCW  trunc mode\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP,4\t# Convert float to long\n\t"
            "MOVSS  [ESP],$src\n\t"
            "FLD_S  [ESP]\n\t"
            "ADD    ESP,4\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode( X2L_encoding(src) );
  ins_pipe( pipe_slow );
%}
11793 
// Convert an int (in a stack slot) to a double in an x87 register.
instruct convI2D_reg(regD dst, stackSlotI src) %{
  predicate( UseSSE<=1 );
  match(Set dst (ConvI2D src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode(Push_Mem_I(src), Pop_Reg_D(dst));
  ins_pipe( fpu_reg_mem );
%}

// Convert an int in a gp register to a double in xmm (CVTSI2SD).
instruct convI2XD_reg(regXD dst, eRegI src) %{
  predicate( UseSSE>=2 && !UseXmmI2D );
  match(Set dst (ConvI2D src));
  format %{ "CVTSI2SD $dst,$src" %}
  opcode(0xF2, 0x0F, 0x2A);               // F2 0F 2A: CVTSI2SD
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// Same conversion with the int load folded into the CVTSI2SD.
instruct convI2XD_mem(regXD dst, memory mem) %{
  predicate( UseSSE>=2 );
  match(Set dst (ConvI2D (LoadI mem)));
  format %{ "CVTSI2SD $dst,$mem" %}
  opcode(0xF2, 0x0F, 0x2A);
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegMem(dst, mem));
  ins_pipe( pipe_slow );
%}

// Int-to-double via MOVD + CVTDQ2PD, selected by the UseXmmI2D flag.
instruct convXI2XD_reg(regXD dst, eRegI src)
%{
  predicate( UseSSE>=2 && UseXmmI2D );
  match(Set dst (ConvI2D src));

  format %{ "MOVD  $dst,$src\n\t"
            "CVTDQ2PD $dst,$dst\t# i2d" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ cvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe(pipe_slow); // XXX
%}

// Convert an int loaded from memory to a double (FILD straight from memory).
instruct convI2D_mem(regD dst, memory mem) %{
  predicate( UseSSE<=1 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2D (LoadI mem)));
  format %{ "FILD   $mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDB);      /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_D(dst));
  ins_pipe( fpu_reg_mem );
%}
11846 
// Convert a byte to a float; no rounding step needed.
// (The predicate recognizes an (AndI x 255) input, so the value fits in
// 8 bits and converts to float exactly.)
instruct conv24I2F_reg(regF dst, stackSlotI src) %{
  predicate( UseSSE==0 && n->in(1)->Opcode() == Op_AndI && n->in(1)->in(2)->is_Con() && n->in(1)->in(2)->get_int() == 255 );
  match(Set dst (ConvI2F src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}

  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode(Push_Mem_I(src), Pop_Reg_F(dst));
  ins_pipe( fpu_reg_mem );
%}

// In 24-bit mode, force exponent rounding by storing back out
instruct convI2F_SSF(stackSlotF dst, stackSlotI src) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F src));
  ins_cost(200);
  format %{ "FILD   $src\n\t"
            "FSTP_S $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode( Push_Mem_I(src),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_mem );
%}

// In 24-bit mode, force exponent rounding by storing back out
// (same as above, but FILD directly from memory)
instruct convI2F_SSF_mem(stackSlotF dst, memory mem) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F (LoadI mem)));
  ins_cost(200);
  format %{ "FILD   $mem\n\t"
            "FSTP_S $dst" %}
  opcode(0xDB);  /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Mem_F(dst));
  ins_pipe( fpu_mem_mem );
%}

// This instruction does not round to 24-bits
instruct convI2F_reg(regF dst, stackSlotI src) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode( Push_Mem_I(src),
              Pop_Reg_F(dst));
  ins_pipe( fpu_reg_mem );
%}

// This instruction does not round to 24-bits
instruct convI2F_mem(regF dst, memory mem) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F (LoadI mem)));
  format %{ "FILD   $mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDB);      /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_F(dst));
  ins_pipe( fpu_reg_mem );
%}

// Convert an int to a float in xmm; no rounding step needed.
instruct convI2X_reg(regX dst, eRegI src) %{
  predicate( UseSSE==1 || UseSSE>=2 && !UseXmmI2F );
  match(Set dst (ConvI2F src));
  format %{ "CVTSI2SS $dst, $src" %}

  opcode(0xF3, 0x0F, 0x2A);  /* F3 0F 2A /r */
  ins_encode( OpcP, OpcS, Opcode(tertiary), RegReg(dst, src));
  ins_pipe( pipe_slow );
%}

// Int-to-float via MOVD + CVTDQ2PS, selected by the UseXmmI2F flag.
 instruct convXI2X_reg(regX dst, eRegI src)
%{
  predicate( UseSSE>=2 && UseXmmI2F );
  match(Set dst (ConvI2F src));

  format %{ "MOVD  $dst,$src\n\t"
            "CVTDQ2PS $dst,$dst\t# i2f" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ cvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe(pipe_slow); // XXX
%}
11933 
// Sign-extend convert int to long: copy to both halves, then arithmetic
// shift of the high half replicates the sign bit.
instruct convI2L_reg( eRegL dst, eRegI src, eFlagsReg cr) %{
  match(Set dst (ConvI2L src));
  effect(KILL cr);
  ins_cost(375);
  format %{ "MOV    $dst.lo,$src\n\t"
            "MOV    $dst.hi,$src\n\t"
            "SAR    $dst.hi,31" %}
  ins_encode(convert_int_long(dst,src));
  ins_pipe( ialu_reg_reg_long );
%}

// Zero-extend convert int to long
// (matches the (AndL (ConvI2L x) 0xFFFFFFFF) idiom; cheaper than the
// sign-extending form above, hence the lower ins_cost)
instruct convI2L_reg_zex(eRegL dst, eRegI src, immL_32bits mask, eFlagsReg flags ) %{
  match(Set dst (AndL (ConvI2L src) mask) );
  effect( KILL flags );
  ins_cost(250);
  format %{ "MOV    $dst.lo,$src\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  opcode(0x33); // XOR
  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
  ins_pipe( ialu_reg_reg_long );
%}

// Zero-extend long
// (masking a long with 0xFFFFFFFF: copy low half, zero high half)
instruct zerox_long(eRegL dst, eRegL src, immL_32bits mask, eFlagsReg flags ) %{
  match(Set dst (AndL src mask) );
  effect( KILL flags );
  ins_cost(250);
  format %{ "MOV    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$dst.hi\n\t" %}
  opcode(0x33); // XOR
  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
  ins_pipe( ialu_reg_reg_long );
%}
11968 
// Convert long to double via the x87 FPU (UseSSE<=1): push both halves of
// the long on the stack, FILD it as a 64-bit integer, store rounded result
// to the destination stack slot. ESP movement is why flags are killed.
instruct convL2D_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE<=1);
  match(Set dst (ConvL2D src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD   ST,[ESP + #0]\n\t"
            "ADD    ESP,8\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double(src), Pop_Mem_D(dst));
  ins_pipe( pipe_slow );
%}

// Convert long to double into an XMM register (UseSSE>=2): the x87 FPU
// still does the integer load/convert, then the rounded result is bounced
// through the stack into $dst with MOVSD.
instruct convL2XD_reg( regXD dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (ConvL2D src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD_D [ESP]\n\t"
            "FSTP_D [ESP]\n\t"
            "MOVSD  $dst,[ESP]\n\t"
            "ADD    ESP,8" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double2(src), Push_ResultXD(dst));
  ins_pipe( pipe_slow );
%}

// Convert long to single float into an XMM register (UseSSE>=1), same
// FPU-then-bounce-through-stack scheme as convL2XD_reg but with FSTP_S.
instruct convL2X_reg( regX dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE>=1);
  match(Set dst (ConvL2F src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
            "PUSH   $src.lo\n\t"
            "FILD_D [ESP]\n\t"
            "FSTP_S [ESP]\n\t"
            "MOVSS  $dst,[ESP]\n\t"
            "ADD    ESP,8" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double2(src), Push_ResultX(dst,0x8));
  ins_pipe( pipe_slow );
%}

// Convert long to single float into a stack slot via the x87 FPU
// (no SSE predicate: used when the result lives on the FPU stack path).
instruct convL2F_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (ConvL2F src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
            "PUSH   $src.lo\n\t"
            "FILD   ST,[ESP + #0]\n\t"
            "ADD    ESP,8\n\t"
            "FSTP_S $dst\t# F-round" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double(src), Pop_Mem_F(dst));
  ins_pipe( pipe_slow );
%}

// Convert long to int: simply copy the low half of the long pair.
instruct convL2I_reg( eRegI dst, eRegL src ) %{
  match(Set dst (ConvL2I src));
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src.lo" %}
  ins_encode(enc_CopyL_Lo(dst,src));
  ins_pipe( ialu_reg_reg );
%}
12033 
12034 
// MoveF2I / MoveI2F: raw 32-bit bit-pattern moves between float and int
// representations (Float.floatToRawIntBits / intBitsToFloat). No value
// conversion is performed — only the bits move.

// Load the 32-bit bit pattern of a spilled float into an int register.
instruct MoveF2I_stack_reg(eRegI dst, stackSlotF src) %{
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );
  ins_cost(100);
  format %{ "MOV    $dst,$src\t# MoveF2I_stack_reg" %}
  opcode(0x8B);   // MOV r32, m32
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );
%}

// Store an x87 float register to an int stack slot (UseSSE==0 path).
instruct MoveF2I_reg_stack(stackSlotI dst, regF src) %{
  predicate(UseSSE==0);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );

  ins_cost(125);
  format %{ "FST_S  $dst,$src\t# MoveF2I_reg_stack" %}
  ins_encode( Pop_Mem_Reg_F(dst, src) );
  ins_pipe( fpu_mem_reg );
%}

// Store an XMM float to an int stack slot with MOVSS (UseSSE>=1).
instruct MoveF2I_reg_stack_sse(stackSlotI dst, regX src) %{
  predicate(UseSSE>=1);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );

  ins_cost(95);
  format %{ "MOVSS  $dst,$src\t# MoveF2I_reg_stack_sse" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x11), RegMem(src, dst));   // MOVSS m32, xmm
  ins_pipe( pipe_slow );
%}

// Move XMM float bits directly into an int register with MOVD (UseSSE>=2);
// cheapest variant, no memory round-trip.
instruct MoveF2I_reg_reg_sse(eRegI dst, regX src) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );
  ins_cost(85);
  format %{ "MOVD   $dst,$src\t# MoveF2I_reg_reg_sse" %}
  ins_encode( MovX2I_reg(dst, src));
  ins_pipe( pipe_slow );
%}

// Store an int register's bits to a float stack slot.
instruct MoveI2F_reg_stack(stackSlotF dst, eRegI src) %{
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(100);
  format %{ "MOV    $dst,$src\t# MoveI2F_reg_stack" %}
  opcode(0x89);   // MOV m32, r32
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}


// Load int bits from a stack slot onto the x87 stack as a float (UseSSE==0).
instruct MoveI2F_stack_reg(regF dst, stackSlotI src) %{
  predicate(UseSSE==0);
  match(Set dst (MoveI2F src));
  effect(DEF dst, USE src);

  ins_cost(125);
  format %{ "FLD_S  $src\n\t"
            "FSTP   $dst\t# MoveI2F_stack_reg" %}
  opcode(0xD9);               /* D9 /0, FLD m32real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_F(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load int bits from a stack slot into an XMM register with MOVSS (UseSSE>=1).
instruct MoveI2F_stack_reg_sse(regX dst, stackSlotI src) %{
  predicate(UseSSE>=1);
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(95);
  format %{ "MOVSS  $dst,$src\t# MoveI2F_stack_reg_sse" %}
  ins_encode( Opcode(0xF3), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));   // MOVSS xmm, m32
  ins_pipe( pipe_slow );
%}

// Move int register bits directly into an XMM register with MOVD (UseSSE>=2).
instruct MoveI2F_reg_reg_sse(regX dst, eRegI src) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(85);
  format %{ "MOVD   $dst,$src\t# MoveI2F_reg_reg_sse" %}
  ins_encode( MovI2X_reg(dst, src) );
  ins_pipe( pipe_slow );
%}
12124 
// MoveD2L / MoveL2D: raw 64-bit bit-pattern moves between double and long
// representations (Double.doubleToRawLongBits / longBitsToDouble).

// Load a spilled double's two 32-bit halves into a long register pair.
instruct MoveD2L_stack_reg(eRegL dst, stackSlotD src) %{
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);

  ins_cost(250);
  format %{ "MOV    $dst.lo,$src\n\t"
            "MOV    $dst.hi,$src+4\t# MoveD2L_stack_reg" %}
  opcode(0x8B, 0x8B);   // two MOV r32, m32 loads
  ins_encode( OpcP, RegMem(dst,src), OpcS, RegMem_Hi(dst,src));
  ins_pipe( ialu_mem_long_reg );
%}

// Store an x87 double register to a long stack slot (UseSSE<=1).
instruct MoveD2L_reg_stack(stackSlotL dst, regD src) %{
  predicate(UseSSE<=1);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);

  ins_cost(125);
  format %{ "FST_D  $dst,$src\t# MoveD2L_reg_stack" %}
  ins_encode( Pop_Mem_Reg_D(dst, src) );
  ins_pipe( fpu_mem_reg );
%}

// Store an XMM double to a long stack slot with MOVSD (UseSSE>=2).
instruct MoveD2L_reg_stack_sse(stackSlotL dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);
  ins_cost(95);

  format %{ "MOVSD  $dst,$src\t# MoveD2L_reg_stack_sse" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x11), RegMem(src,dst));   // MOVSD m64, xmm
  ins_pipe( pipe_slow );
%}

// Move XMM double bits into a long register pair without touching memory:
// MOVD the low word, shuffle the high word down with PSHUFLW into $tmp,
// then MOVD it out. $tmp is a scratch XMM register.
instruct MoveD2L_reg_reg_sse(eRegL dst, regXD src, regXD tmp) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src, TEMP tmp);
  ins_cost(85);
  format %{ "MOVD   $dst.lo,$src\n\t"
            "PSHUFLW $tmp,$src,0x4E\n\t"
            "MOVD   $dst.hi,$tmp\t# MoveD2L_reg_reg_sse" %}
  ins_encode( MovXD2L_reg(dst, src, tmp) );
  ins_pipe( pipe_slow );
%}

// Store a long register pair into a double stack slot, one half at a time.
instruct MoveL2D_reg_stack(stackSlotD dst, eRegL src) %{
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi\t# MoveL2D_reg_stack" %}
  opcode(0x89, 0x89);   // two MOV m32, r32 stores
  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
  ins_pipe( ialu_mem_long_reg );
%}


// Load long bits from a stack slot onto the x87 stack as a double (UseSSE<=1).
instruct MoveL2D_stack_reg(regD dst, stackSlotL src) %{
  predicate(UseSSE<=1);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);
  ins_cost(125);

  format %{ "FLD_D  $src\n\t"
            "FSTP   $dst\t# MoveL2D_stack_reg" %}
  opcode(0xDD);               /* DD /0, FLD m64real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_D(dst) );
  ins_pipe( fpu_reg_mem );
%}


// Load long bits into an XMM register with MOVSD, which also clears the
// upper half of the register (preferred when UseXmmLoadAndClearUpper).
instruct MoveL2D_stack_reg_sse(regXD dst, stackSlotL src) %{
  predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(95);
  format %{ "MOVSD  $dst,$src\t# MoveL2D_stack_reg_sse" %}
  ins_encode( Opcode(0xF2), Opcode(0x0F), Opcode(0x10), RegMem(dst,src));   // MOVSD xmm, m64
  ins_pipe( pipe_slow );
%}

// Same as above but with MOVLPD, which leaves the upper half untouched
// (for CPUs where the load-and-clear form is slower).
// NOTE(review): the format text says "MoveL2D_stack_reg_sse" although this
// is the _partial variant — listing comment only; confirm before changing.
instruct MoveL2D_stack_reg_sse_partial(regXD dst, stackSlotL src) %{
  predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(95);
  format %{ "MOVLPD $dst,$src\t# MoveL2D_stack_reg_sse" %}
  ins_encode( Opcode(0x66), Opcode(0x0F), Opcode(0x12), RegMem(dst,src));   // MOVLPD xmm, m64
  ins_pipe( pipe_slow );
%}

// Move a long register pair into an XMM register without touching memory:
// MOVD each half into an XMM register and interleave with PUNPCKLDQ.
instruct MoveL2D_reg_reg_sse(regXD dst, eRegL src, regXD tmp) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveL2D src));
  effect(TEMP dst, USE src, TEMP tmp);
  ins_cost(85);
  format %{ "MOVD   $dst,$src.lo\n\t"
            "MOVD   $tmp,$src.hi\n\t"
            "PUNPCKLDQ $dst,$tmp\t# MoveL2D_reg_reg_sse" %}
  ins_encode( MovL2XD_reg(dst, src, tmp) );
  ins_pipe( pipe_slow );
%}
12232 
// Replicate scalar to packed byte (1 byte) values in xmm
instruct Repl8B_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate8B src));
  format %{ "MOVDQA  $dst,$src\n\t"
            "PUNPCKLBW $dst,$dst\n\t"
            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
  ins_encode( pshufd_8x8(dst, src));
  ins_pipe( pipe_slow );
%}

// Replicate scalar to packed byte (1 byte) values in xmm
// (source is a GP register; moved into xmm first with MOVD).
instruct Repl8B_eRegI(regXD dst, eRegI src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate8B src));
  format %{ "MOVD    $dst,$src\n\t"
            "PUNPCKLBW $dst,$dst\n\t"
            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
  ins_encode( mov_i2x(dst, src), pshufd_8x8(dst, dst));
  ins_pipe( pipe_slow );
%}

// Replicate scalar zero to packed byte (1 byte) values in xmm
// (PXOR reg,reg is the cheapest way to zero an xmm register).
instruct Repl8B_immI0(regXD dst, immI0 zero) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate8B zero));
  format %{ "PXOR  $dst,$dst\t! replicate8B" %}
  ins_encode( pxor(dst, dst));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed short (2 byte) values in xmm
instruct Repl4S_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate4S src));
  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
  ins_encode( pshufd_4x16(dst, src));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed short (2 byte) values in xmm
// (source is a GP register; moved into xmm first with MOVD).
instruct Repl4S_eRegI(regXD dst, eRegI src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate4S src));
  format %{ "MOVD    $dst,$src\n\t"
            "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar zero to packed short (2 byte) values in xmm
instruct Repl4S_immI0(regXD dst, immI0 zero) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate4S zero));
  format %{ "PXOR  $dst,$dst\t! replicate4S" %}
  ins_encode( pxor(dst, dst));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed char (2 byte) values in xmm
instruct Repl4C_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate4C src));
  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
  ins_encode( pshufd_4x16(dst, src));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed char (2 byte) values in xmm
// (source is a GP register; moved into xmm first with MOVD).
instruct Repl4C_eRegI(regXD dst, eRegI src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate4C src));
  format %{ "MOVD    $dst,$src\n\t"
            "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
  ins_encode( mov_i2x(dst, src), pshufd_4x16(dst, dst));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar zero to packed char (2 byte) values in xmm
instruct Repl4C_immI0(regXD dst, immI0 zero) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate4C zero));
  format %{ "PXOR  $dst,$dst\t! replicate4C" %}
  ins_encode( pxor(dst, dst));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed integer (4 byte) values in xmm
instruct Repl2I_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate2I src));
  format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
  ins_encode( pshufd(dst, src, 0x00));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed integer (4 byte) values in xmm
// (source is a GP register; moved into xmm first with MOVD).
instruct Repl2I_eRegI(regXD dst, eRegI src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate2I src));
  format %{ "MOVD   $dst,$src\n\t"
            "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
  ins_encode( mov_i2x(dst, src), pshufd(dst, dst, 0x00));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar zero to packed integer (4 byte) values in xmm
instruct Repl2I_immI0(regXD dst, immI0 zero) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate2I zero));
  format %{ "PXOR  $dst,$dst\t! replicate2I" %}
  ins_encode( pxor(dst, dst));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed single precision floating point values in xmm
// (0xe0 = 11'10'00'00: lanes 0 and 1 both take element 0, duplicating the
// scalar into the two packed positions).
instruct Repl2F_reg(regXD dst, regXD src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate2F src));
  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
  ins_encode( pshufd(dst, src, 0xe0));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed single precision floating point values in xmm
instruct Repl2F_regX(regXD dst, regX src) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate2F src));
  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
  ins_encode( pshufd(dst, src, 0xe0));
  ins_pipe( fpu_reg_reg );
%}

// Replicate scalar to packed single precision floating point values in xmm
instruct Repl2F_immXF0(regXD dst, immXF0 zero) %{
  predicate(UseSSE>=2);
  match(Set dst (Replicate2F zero));
  format %{ "PXOR  $dst,$dst\t! replicate2F" %}
  ins_encode( pxor(dst, dst));
  ins_pipe( fpu_reg_reg );
%}
12374 
12375 // =======================================================================
12376 // fast clearing of an array
12377 instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
12378   match(Set dummy (ClearArray cnt base));
12379   effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
12380   format %{ "SHL    ECX,1\t# Convert doublewords to words\n\t"
12381             "XOR    EAX,EAX\n\t"
12382             "REP STOS\t# store EAX into [EDI++] while ECX--" %}
12383   opcode(0,0x4);
12384   ins_encode( Opcode(0xD1), RegOpc(ECX),
12385               OpcRegReg(0x33,EAX,EAX),
12386               Opcode(0xF3), Opcode(0xAB) );
12387   ins_pipe( pipe_slow );
12388 %}
12389 
// String.compareTo intrinsic. Register constraints are fixed (EDI/ECX/
// ESI/EBX in, EAX out) to match the MacroAssembler::string_compare stub.
instruct string_compare(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eBXRegI cnt2,
                        eAXRegI result, regXD tmp1, regXD tmp2, eFlagsReg cr) %{
  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);

  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1, $tmp2" %}
  ins_encode %{
    __ string_compare($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register, $result$$Register,
                      $tmp1$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// fast string equals
// Delegates to char_arrays_equals with is_array_equ=false (string variant).
instruct string_equals(eDIRegP str1, eSIRegP str2, eCXRegI cnt, eAXRegI result,
                       regXD tmp1, regXD tmp2, eBXRegI tmp3, eFlagsReg cr) %{
  match(Set result (StrEquals (Binary str1 str2) cnt));
  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr);

  format %{ "String Equals $str1,$str2,$cnt -> $result    // KILL $tmp1, $tmp2, $tmp3" %}
  ins_encode %{
    __ char_arrays_equals(false, $str1$$Register, $str2$$Register,
                          $cnt$$Register, $result$$Register, $tmp3$$Register,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// String.indexOf intrinsic; only available with SSE4.2 (PCMPESTRI-based).
instruct string_indexof(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, eAXRegI cnt2,
                        eBXRegI result, regXD tmp1, eCXRegI tmp2, eFlagsReg cr) %{
  predicate(UseSSE42Intrinsics);
  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL tmp2, KILL cr);

  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp2, $tmp1" %}
  ins_encode %{
    __ string_indexof($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register, $result$$Register,
                      $tmp1$$XMMRegister, $tmp2$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// fast array equals
// Delegates to char_arrays_equals with is_array_equ=true (array variant).
instruct array_equals(eDIRegP ary1, eSIRegP ary2, eAXRegI result,
                      regXD tmp1, regXD tmp2, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr)
%{
  match(Set result (AryEq ary1 ary2));
  effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr);
  //ins_cost(300);

  format %{ "Array Equals $ary1,$ary2 -> $result   // KILL $tmp1, $tmp2, $tmp3, $tmp4" %}
  ins_encode %{
    __ char_arrays_equals(true, $ary1$$Register, $ary2$$Register,
                          $tmp3$$Register, $result$$Register, $tmp4$$Register,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
12450 
//----------Control Flow Instructions------------------------------------------
// Signed compare Instructions

// Compare two int registers, setting signed flags.
instruct compI_eReg(eFlagsReg cr, eRegI op1, eRegI op2) %{
  match(Set cr (CmpI op1 op2));
  effect( DEF cr, USE op1, USE op2 );
  format %{ "CMP    $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Compare an int register with an immediate. OpcSErm picks the sign-extended
// 8-bit form (83 /7) when the immediate fits, else the 32-bit form (81 /7).
instruct compI_eReg_imm(eFlagsReg cr, eRegI op1, immI op2) %{
  match(Set cr (CmpI op1 op2));
  effect( DEF cr, USE op1 );
  format %{ "CMP    $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  // ins_encode( RegImm( op1, op2) );  /* Was CmpImm */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Cisc-spilled version of cmpI_eReg
// Compares a register against an int loaded from memory.
instruct compI_eReg_mem(eFlagsReg cr, eRegI op1, memory op2) %{
  match(Set cr (CmpI op1 (LoadI op2)));

  format %{ "CMP    $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

// Compare against zero: TEST reg,reg sets the same Z/S flags as CMP reg,0
// but is shorter.
instruct testI_reg( eFlagsReg cr, eRegI src, immI0 zero ) %{
  match(Set cr (CmpI src zero));
  effect( DEF cr, USE src );

  format %{ "TEST   $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Test (AndI src con) against zero without materializing the AND result:
// TEST r32, imm32 (F7 /0).
instruct testI_reg_imm( eFlagsReg cr, eRegI src, immI con, immI0 zero ) %{
  match(Set cr (CmpI (AndI src con) zero));

  format %{ "TEST   $src,$con" %}
  opcode(0xF7,0x00);
  ins_encode( OpcP, RegOpc(src), Con32(con) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Test (AndI src mem) against zero: TEST r32, m32.
instruct testI_reg_mem( eFlagsReg cr, eRegI src, memory mem, immI0 zero ) %{
  match(Set cr (CmpI (AndI src mem) zero));

  format %{ "TEST   $src,$mem" %}
  opcode(0x85);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_cr_reg_mem );
%}
12510 
// Unsigned compare Instructions; really, same as signed except they
// produce an eFlagsRegU instead of eFlagsReg.
instruct compU_eReg(eFlagsRegU cr, eRegI op1, eRegI op2) %{
  match(Set cr (CmpU op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Unsigned compare of a register against an immediate (81/83 /7 via OpcSErm).
instruct compU_eReg_imm(eFlagsRegU cr, eRegI op1, immI op2) %{
  match(Set cr (CmpU op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// // Cisc-spilled version of cmpU_eReg
instruct compU_eReg_mem(eFlagsRegU cr, eRegI op1, memory op2) %{
  match(Set cr (CmpU op1 (LoadI op2)));

  format %{ "CMPu   $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

// // Cisc-spilled version of cmpU_eReg
//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, eRegI op2) %{
//  match(Set cr (CmpU (LoadI op1) op2));
//
//  format %{ "CMPu   $op1,$op2" %}
//  ins_cost(500);
//  opcode(0x39);  /* Opcode 39 /r */
//  ins_encode( OpcP, RegMem( op1, op2) );
//%}

// Unsigned test against zero: TEST reg,reg.
instruct testU_reg( eFlagsRegU cr, eRegI src, immI0 zero ) %{
  match(Set cr (CmpU src zero));

  format %{ "TESTu  $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}
12560 
// Unsigned pointer compare Instructions
instruct compP_eReg(eFlagsRegU cr, eRegP op1, eRegP op2) %{
  match(Set cr (CmpP op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Compare a pointer register against a pointer immediate.
instruct compP_eReg_imm(eFlagsRegU cr, eRegP op1, immP op2) %{
  match(Set cr (CmpP op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// // Cisc-spilled version of cmpP_eReg
instruct compP_eReg_mem(eFlagsRegU cr, eRegP op1, memory op2) %{
  match(Set cr (CmpP op1 (LoadP op2)));

  format %{ "CMPu   $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

// // Cisc-spilled version of cmpP_eReg
//instruct compP_mem_eReg(eFlagsRegU cr, memory op1, eRegP op2) %{
//  match(Set cr (CmpP (LoadP op1) op2));
//
//  format %{ "CMPu   $op1,$op2" %}
//  ins_cost(500);
//  opcode(0x39);  /* Opcode 39 /r */
//  ins_encode( OpcP, RegMem( op1, op2) );
//%}

// Compare raw pointer (used in out-of-heap check).
// Only works because non-oop pointers must be raw pointers
// and raw pointers have no anti-dependencies.
// The predicate inspects the loaded value's type to exclude oops.
instruct compP_mem_eReg( eFlagsRegU cr, eRegP op1, memory op2 ) %{
  predicate( !n->in(2)->in(2)->bottom_type()->isa_oop_ptr() );
  match(Set cr (CmpP op1 (LoadP op2)));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

//
// This will generate a signed flags result. This should be ok
// since any compare to a zero should be eq/neq.
instruct testP_reg( eFlagsReg cr, eRegP src, immP0 zero ) %{
  match(Set cr (CmpP src zero));

  format %{ "TEST   $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Cisc-spilled version of testP_reg
// This will generate a signed flags result. This should be ok
// since any compare to a zero should be eq/neq.
// Uses TEST m32, 0xFFFFFFFF so the null check can run straight from memory.
instruct testP_Reg_mem( eFlagsReg cr, memory op, immI0 zero ) %{
  match(Set cr (CmpP (LoadP op) zero));

  format %{ "TEST   $op,0xFFFFFFFF" %}
  ins_cost(500);
  opcode(0xF7);               /* Opcode F7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,op), Con_d32(0xFFFFFFFF) );
  ins_pipe( ialu_cr_reg_imm );
%}
12638 
// Yanked all unsigned pointer compare operations.
// Pointer compares are done with CmpP which is already unsigned.

//----------Max and Min--------------------------------------------------------
// Min Instructions
////
//   *** Min and Max using the conditional move are slower than the
//   *** branch version on a Pentium III.
// // Conditional move for min
//instruct cmovI_reg_lt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
//  effect( USE_DEF op2, USE op1, USE cr );
//  format %{ "CMOVlt $op2,$op1\t! min" %}
//  opcode(0x4C,0x0F);
//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
//  ins_pipe( pipe_cmov_reg );
//%}
//
//// Min Register with Register (P6 version)
//instruct minI_eReg_p6( eRegI op1, eRegI op2 ) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set op2 (MinI op1 op2));
//  ins_cost(200);
//  expand %{
//    eFlagsReg cr;
//    compI_eReg(cr,op1,op2);
//    cmovI_reg_lt(op2,op1,cr);
//  %}
//%}

// Min Register with Register (generic version)
// dst is both input and output (read-modify-write); min_enc emits the
// compare-and-branch sequence, which clobbers the flags.
instruct minI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
  match(Set dst (MinI dst src));
  effect(KILL flags);
  ins_cost(300);

  format %{ "MIN    $dst,$src" %}
  opcode(0xCC);   // placeholder; real encoding comes from min_enc
  ins_encode( min_enc(dst,src) );
  ins_pipe( pipe_slow );
%}
12679 
// Max Register with Register
//   *** Min and Max using the conditional move are slower than the
//   *** branch version on a Pentium III.
// // Conditional move for max
//instruct cmovI_reg_gt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
//  effect( USE_DEF op2, USE op1, USE cr );
//  format %{ "CMOVgt $op2,$op1\t! max" %}
//  opcode(0x4F,0x0F);
//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
//  ins_pipe( pipe_cmov_reg );
//%}
//
// // Max Register with Register (P6 version)
//instruct maxI_eReg_p6( eRegI op1, eRegI op2 ) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set op2 (MaxI op1 op2));
//  ins_cost(200);
//  expand %{
//    eFlagsReg cr;
//    compI_eReg(cr,op1,op2);
//    cmovI_reg_gt(op2,op1,cr);
//  %}
//%}

// Max Register with Register (generic version)
// dst is both input and output (read-modify-write); max_enc emits the
// compare-and-branch sequence, which clobbers the flags.
instruct maxI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
  match(Set dst (MaxI dst src));
  effect(KILL flags);
  ins_cost(300);

  format %{ "MAX    $dst,$src" %}
  opcode(0xCC);   // placeholder; real encoding comes from max_enc
  ins_encode( max_enc(dst,src) );
  ins_pipe( pipe_slow );
%}
12715 
12716 // ============================================================================
12717 // Branch Instructions
12718 // Jump Table
12719 instruct jumpXtnd(eRegI switch_val) %{
12720   match(Jump switch_val);
12721   ins_cost(350);
12722 
12723   format %{  "JMP    [table_base](,$switch_val,1)\n\t" %}
12724 
12725   ins_encode %{
12726     address table_base  = __ address_table_constant(_index2label);
12727 
12728     // Jump to Address(table_base + switch_reg)
12729     InternalAddress table(table_base);
12730     Address index(noreg, $switch_val$$Register, Address::times_1);
12731     __ jump(ArrayAddress(table, index));
12732   %}
12733   ins_pc_relative(1);
12734   ins_pipe(pipe_jmp);
12735 %}
12736 
12737 // Jump Direct - Label defines a relative address from JMP+1
12738 instruct jmpDir(label labl) %{
12739   match(Goto);
12740   effect(USE labl);
12741 
12742   ins_cost(300);
12743   format %{ "JMP    $labl" %}
12744   size(5);
12745   opcode(0xE9);
12746   ins_encode( OpcP, Lbl( labl ) );
12747   ins_pipe( pipe_jmp );
12748   ins_pc_relative(1);
12749 %}
12750 
12751 // Jump Direct Conditional - Label defines a relative address from Jcc+1
12752 instruct jmpCon(cmpOp cop, eFlagsReg cr, label labl) %{
12753   match(If cop cr);
12754   effect(USE labl);
12755 
12756   ins_cost(300);
12757   format %{ "J$cop    $labl" %}
12758   size(6);
12759   opcode(0x0F, 0x80);
12760   ins_encode( Jcc( cop, labl) );
12761   ins_pipe( pipe_jcc );
12762   ins_pc_relative(1);
12763 %}
12764 
12765 // Jump Direct Conditional - Label defines a relative address from Jcc+1
12766 instruct jmpLoopEnd(cmpOp cop, eFlagsReg cr, label labl) %{
12767   match(CountedLoopEnd cop cr);
12768   effect(USE labl);
12769 
12770   ins_cost(300);
12771   format %{ "J$cop    $labl\t# Loop end" %}
12772   size(6);
12773   opcode(0x0F, 0x80);
12774   ins_encode( Jcc( cop, labl) );
12775   ins_pipe( pipe_jcc );
12776   ins_pc_relative(1);
12777 %}
12778 
12779 // Jump Direct Conditional - Label defines a relative address from Jcc+1
12780 instruct jmpLoopEndU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
12781   match(CountedLoopEnd cop cmp);
12782   effect(USE labl);
12783 
12784   ins_cost(300);
12785   format %{ "J$cop,u  $labl\t# Loop end" %}
12786   size(6);
12787   opcode(0x0F, 0x80);
12788   ins_encode( Jcc( cop, labl) );
12789   ins_pipe( pipe_jcc );
12790   ins_pc_relative(1);
12791 %}
12792 
12793 instruct jmpLoopEndUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
12794   match(CountedLoopEnd cop cmp);
12795   effect(USE labl);
12796 
12797   ins_cost(200);
12798   format %{ "J$cop,u  $labl\t# Loop end" %}
12799   size(6);
12800   opcode(0x0F, 0x80);
12801   ins_encode( Jcc( cop, labl) );
12802   ins_pipe( pipe_jcc );
12803   ins_pc_relative(1);
12804 %}
12805 
12806 // Jump Direct Conditional - using unsigned comparison
// Long-form conditional jump on unsigned-compare flags.
instruct jmpConU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,u  $labl" %}
  size(6);
  opcode(0x0F, 0x80);
  ins_encode(Jcc(cop, labl));
  ins_pipe(pipe_jcc);
  ins_pc_relative(1);
%}
12819 
// Long-form conditional jump for the unordered-compare (CF-based) flags
// variant; cheaper cost (200) so it wins over jmpConU when applicable.
instruct jmpConUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(200);
  format %{ "J$cop,u  $labl" %}
  size(6);
  opcode(0x0F, 0x80);
  ins_encode(Jcc(cop, labl));
  ins_pipe(pipe_jcc);
  ins_pc_relative(1);
%}
12832 
// Conditional jump for EQ/NE on unordered-compare flags.  Because an
// unordered FP compare sets the parity flag, a JP is emitted ahead of the
// Jcc: for NE both jumps go to $labl (unordered counts as "not equal");
// for EQ the JP skips over the Jcc so unordered falls through as "not
// equal".  Two 6-byte jumps => size(12).
instruct jmpConUCF2(cmpOpUCF2 cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(200);
  format %{ $$template
    if ($cop$$cmpcode == Assembler::notEqual) {
      $$emit$$"JP,u   $labl\n\t"
      $$emit$$"J$cop,u   $labl"
    } else {
      $$emit$$"JP,u   done\n\t"
      $$emit$$"J$cop,u   $labl\n\t"
      $$emit$$"done:"
    }
  %}
  size(12);
  opcode(0x0F, 0x80);
  ins_encode %{
    Label* l = $labl$$label;
    // First jump: JP -- 0F prefix, then secondary opcode with the parity
    // condition OR'd in, then a 32-bit displacement.
    $$$emit8$primary;
    emit_cc(cbuf, $secondary, Assembler::parity);
    int parity_disp = -1;
    if ($cop$$cmpcode == Assembler::notEqual) {
       // Both jumps target $labl; displacement is relative to the end of
       // this jump's 4-byte displacement field.
       parity_disp = l ? (l->loc_pos() - (cbuf.code_size() + 4)) : 0;
    } else if ($cop$$cmpcode == Assembler::equal) {
       // Skip over the 6-byte Jcc that follows.
       parity_disp = 6;
    } else {
       ShouldNotReachHere();
    }
    emit_d32(cbuf, parity_disp);
    // Second jump: J$cop to the label.
    $$$emit8$primary;
    emit_cc(cbuf, $secondary, $cop$$cmpcode);
    int disp = l ? (l->loc_pos() - (cbuf.code_size() + 4)) : 0;
    emit_d32(cbuf, disp);
  %}
  ins_pipe(pipe_jcc);
  ins_pc_relative(1);
%}
12874 
12875 // ============================================================================
12876 // The 2nd slow-half of a subtype check.  Scan the subklass's 2ndary superklass
12877 // array for an instance of the superklass.  Set a hidden internal cache on a
12878 // hit (cache is checked with exposed code in gen_subtype_check()).  Return
12879 // NZ for a miss or zero for a hit.  The encoding ALSO sets flags.
// Slow-path subtype check: scan the secondary-supers array of $sub for
// $super with REPNE SCASD.  $result (EDI) is zero on a hit, non-zero on a
// miss; the cache slot is updated on a hit.  ECX and the flags are clobbered.
instruct partialSubtypeCheck( eDIRegP result, eSIRegP sub, eAXRegP super, eCXRegI rcx, eFlagsReg cr ) %{
  match(Set result (PartialSubtypeCheck sub super));
  effect( KILL rcx, KILL cr );

  ins_cost(1100);  // slightly larger than the next version
  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
            "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
            "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
            "JNE,s  miss\t\t# Missed: EDI not-zero\n\t"
            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache\n\t"
            "XOR    $result,$result\t\t Hit: EDI zero\n\t"
     "miss:\t" %}

  opcode(0x1); // Force a XOR of EDI
  ins_encode( enc_PartialSubtypeCheck() );
  ins_pipe( pipe_slow );
%}
12898 
// Variant matched when only the flags of the subtype check are compared
// against zero: produces Z on a hit, NZ on a miss, and skips the final XOR
// (opcode 0x0).  EDI and ECX are clobbered.
instruct partialSubtypeCheck_vs_Zero( eFlagsReg cr, eSIRegP sub, eAXRegP super, eCXRegI rcx, eDIRegP result, immP0 zero ) %{
  match(Set cr (CmpP (PartialSubtypeCheck sub super) zero));
  effect( KILL rcx, KILL result );

  ins_cost(1000);
  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
            "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
            "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
            "JNE,s  miss\t\t# Missed: flags NZ\n\t"
            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache, flags Z\n\t"
     "miss:\t" %}

  opcode(0x0);  // No need to XOR EDI
  ins_encode( enc_PartialSubtypeCheck() );
  ins_pipe( pipe_slow );
%}
12916 
12917 // ============================================================================
12918 // Branch Instructions -- short offset versions
12919 //
12920 // These instructions are used to replace jumps of a long offset (the default
12921 // match) with jumps of a shorter offset.  These instructions are all tagged
12922 // with the ins_short_branch attribute, which causes the ADLC to suppress the
12923 // match rules in general matching.  Instead, the ADLC generates a conversion
12924 // method in the MachNode which can be used to do in-place replacement of the
12925 // long variant with the shorter variant.  The compiler will determine if a
12926 // branch can be taken by the is_short_branch_offset() predicate in the machine
12927 // specific code section of the file.
12928 
12929 // Jump Direct - Label defines a relative address from JMP+1
// Short unconditional jump: EB rel8, 2 bytes.  Tagged ins_short_branch so
// the ADLC only uses it as an in-place replacement for jmpDir.
instruct jmpDir_short(label labl) %{
  match(Goto);
  effect(USE labl);

  ins_cost(300);
  format %{ "JMP,s  $labl" %}
  size(2);
  opcode(0xEB);
  ins_encode( OpcP, LblShort( labl ) );
  ins_pipe( pipe_jmp );
  ins_pc_relative(1);
  ins_short_branch(1);
%}
12943 
12944 // Jump Direct Conditional - Label defines a relative address from Jcc+1
// Short conditional jump: 7x rel8, 2 bytes.  Short-branch replacement
// for jmpCon.
instruct jmpCon_short(cmpOp cop, eFlagsReg cr, label labl) %{
  match(If cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,s  $labl" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}
12958 
12959 // Jump Direct Conditional - Label defines a relative address from Jcc+1
// Short-branch replacement for jmpLoopEnd.
instruct jmpLoopEnd_short(cmpOp cop, eFlagsReg cr, label labl) %{
  match(CountedLoopEnd cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,s  $labl\t# Loop end" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}
12973 
12974 // Jump Direct Conditional - Label defines a relative address from Jcc+1
// Short-branch replacement for jmpLoopEndU (unsigned condition codes).
instruct jmpLoopEndU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl\t# Loop end" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}
12988 
// Short-branch replacement for jmpLoopEndUCF.
instruct jmpLoopEndUCF_short(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl\t# Loop end" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}
13002 
13003 // Jump Direct Conditional - using unsigned comparison
// Short-branch replacement for jmpConU.
instruct jmpConU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}
13017 
// Short-branch replacement for jmpConUCF.
instruct jmpConUCF_short(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl" %}
  size(2);
  opcode(0x70);
  ins_encode( JccShort( cop, labl) );
  ins_pipe( pipe_jcc );
  ins_pc_relative(1);
  ins_short_branch(1);
%}
13031 
// Short-branch replacement for jmpConUCF2: JP + Jcc, each 2 bytes (size 4).
// For NE both jumps target $labl; for EQ the JP skips over the 2-byte Jcc.
instruct jmpConUCF2_short(cmpOpUCF2 cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ $$template
    if ($cop$$cmpcode == Assembler::notEqual) {
      $$emit$$"JP,u,s   $labl\n\t"
      $$emit$$"J$cop,u,s   $labl"
    } else {
      $$emit$$"JP,u,s   done\n\t"
      $$emit$$"J$cop,u,s  $labl\n\t"
      $$emit$$"done:"
    }
  %}
  size(4);
  opcode(0x70);
  ins_encode %{
    Label* l = $labl$$label;
    // JP with an 8-bit displacement.
    emit_cc(cbuf, $primary, Assembler::parity);
    int parity_disp = -1;
    if ($cop$$cmpcode == Assembler::notEqual) {
      // Displacement relative to the end of the 1-byte disp field.
      parity_disp = l ? (l->loc_pos() - (cbuf.code_size() + 1)) : 0;
    } else if ($cop$$cmpcode == Assembler::equal) {
      // Skip over the 2-byte Jcc that follows.
      parity_disp = 2;
    } else {
      ShouldNotReachHere();
    }
    emit_d8(cbuf, parity_disp);
    // Jcc with an 8-bit displacement.
    emit_cc(cbuf, $primary, $cop$$cmpcode);
    int disp = l ? (l->loc_pos() - (cbuf.code_size() + 1)) : 0;
    emit_d8(cbuf, disp);
    // Validate (after the fact) that both targets fit in a rel8.
    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
    assert(-128 <= parity_disp && parity_disp <= 127, "Displacement too large for short jmp");
  %}
  ins_pipe(pipe_jcc);
  ins_pc_relative(1);
  ins_short_branch(1);
%}
13071 
13072 // ============================================================================
13073 // Long Compare
13074 //
13075 // Currently we hold longs in 2 registers.  Comparing such values efficiently
13076 // is tricky.  The flavor of compare used depends on whether we are testing
13077 // for LT, LE, or EQ.  For a simple LT test we can check just the sign bit.
13078 // The GE test is the negated LT test.  The LE test can be had by commuting
13079 // the operands (yielding a GE test) and then negating; negate again for the
13080 // GT test.  The EQ test is done by ORcc'ing the high and low halves, and the
13081 // NE test is negated from that.
13082 
13083 // Due to a shortcoming in the ADLC, it mixes up expressions like:
13084 // (foo (CmpI (CmpL X Y) 0)) and (bar (CmpI (CmpL X 0L) 0)).  Note the
13085 // difference between 'Y' and '0L'.  The tree-matches for the CmpI sections
13086 // are collapsed internally in the ADLC's dfa-gen code.  The match for
13087 // (CmpI (CmpL X Y) 0) is silently replaced with (CmpI (CmpL X 0L) 0) and the
13088 // foo match ends up with the wrong leaf.  One fix is to not match both
13089 // reg-reg and reg-zero forms of long-compare.  This is unfortunate because
13090 // both forms beat the trinary form of long-compare and both are very useful
13091 // on Intel which has so few registers.
13092 
13093 // Manifest a CmpL result in an integer register.  Very painful.
13094 // This is the test to avoid.
// Materialize a three-way long compare (-1/0/+1) in an integer register.
// High halves are compared signed; on high-half equality the low halves are
// compared unsigned.  Kills the flags.
instruct cmpL3_reg_reg(eSIRegI dst, eRegL src1, eRegL src2, eFlagsReg flags ) %{
  match(Set dst (CmpL3 src1 src2));
  effect( KILL flags );
  ins_cost(1000);
  format %{ "XOR    $dst,$dst\n\t"
            "CMP    $src1.hi,$src2.hi\n\t"
            "JLT,s  m_one\n\t"
            "JGT,s  p_one\n\t"
            "CMP    $src1.lo,$src2.lo\n\t"
            "JB,s   m_one\n\t"
            "JEQ,s  done\n"
    "p_one:\tINC    $dst\n\t"
            "JMP,s  done\n"
    "m_one:\tDEC    $dst\n"
     "done:" %}
  ins_encode %{
    Label p_one, m_one, done;
    // dst starts at 0 and is bumped to +1 or -1 below.
    __ xorptr($dst$$Register, $dst$$Register);
    // Signed compare of the high 32 bits.
    __ cmpl(HIGH_FROM_LOW($src1$$Register), HIGH_FROM_LOW($src2$$Register));
    __ jccb(Assembler::less,    m_one);
    __ jccb(Assembler::greater, p_one);
    // High halves equal: unsigned compare of the low 32 bits.
    __ cmpl($src1$$Register, $src2$$Register);
    __ jccb(Assembler::below,   m_one);
    __ jccb(Assembler::equal,   done);
    __ bind(p_one);
    __ incrementl($dst$$Register);
    __ jmpb(done);
    __ bind(m_one);
    __ decrementl($dst$$Register);
    __ bind(done);
  %}
  ins_pipe( pipe_slow );
%}
13128 
13129 //======
13130 // Manifest a CmpL result in the normal flags.  Only good for LT or GE
13131 // compares.  Can be used for LE or GT compares by reversing arguments.
13132 // NOT GOOD FOR EQ/NE tests.
// Long-vs-zero compare for LT/GE: a TEST of the high word against itself
// sets the sign flag, which is all LT/GE need.
instruct cmpL_zero_flags_LTGE( flagsReg_long_LTGE flags, eRegL src, immL0 zero ) %{
  match( Set flags (CmpL src zero ));
  ins_cost(100);
  format %{ "TEST   $src.hi,$src.hi" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg_Hi2( src, src ) );
  ins_pipe( ialu_cr_reg_reg );
%}
13141 
13142 // Manifest a CmpL result in the normal flags.  Only good for LT or GE
13143 // compares.  Can be used for LE or GT compares by reversing arguments.
13144 // NOT GOOD FOR EQ/NE tests.
// Long reg-reg compare for LT/GE: CMP the low halves, then SBB the high
// halves through a temp so the borrow propagates into the final flags.
instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, eRegI tmp ) %{
  match( Set flags (CmpL src1 src2 ));
  effect( TEMP tmp );
  ins_cost(300);
  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
            "MOV    $tmp,$src1.hi\n\t"
            "SBB    $tmp,$src2.hi\t! Compute flags for long compare" %}
  ins_encode( long_cmp_flags2( src1, src2, tmp ) );
  ins_pipe( ialu_cr_reg_reg );
%}
13155 
13156 // Long compares reg < zero/req OR reg >= zero/req.
13157 // Just a wrapper for a normal branch, plus the predicate test.
// Branch on a long LT/GE compare: predicate restricts to lt/ge tests, then
// expands to a plain jmpCon.
instruct cmpL_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
  expand %{
    jmpCon(cmp,flags,labl);    // JLT or JGE...
  %}
%}
13166 
13167 // Compare 2 longs and CMOVE longs.
// CMOVE a long on LT/GE long-compare flags: two CMOVs, one per 32-bit half.
// Requires hardware CMOV support.
instruct cmovLL_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}
13178 
// CMOVE a long from memory on LT/GE long-compare flags.
instruct cmovLL_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
  ins_pipe( pipe_cmov_reg_long );
%}
13189 
13190 // Compare 2 longs and CMOVE ints.
// CMOVE an int on LT/GE long-compare flags.
instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, eRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}
13200 
// CMOVE an int from memory on LT/GE long-compare flags.
instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}
13210 
// Compare 2 longs and CMOVE ptrs.
// CMOVE a pointer on LT/GE long-compare flags.
instruct cmovPP_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}
13221 
13222 // Compare 2 longs and CMOVE doubles
// CMOVE an FPU double on LT/GE long-compare flags (x87 path, UseSSE<=1).
instruct cmovDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regD dst, regD src) %{
  // Parenthesize the BoolTest disjunction: '&&' binds tighter than '||', so
  // without parens the UseSSE<=1 guard only covered the 'lt' term and this
  // rule could match a 'ge' test regardless of UseSSE.  Matches the
  // parenthesization used by the cmovLL/II/PP siblings above.
  predicate( UseSSE<=1 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ) );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}
13231 
13232 // Compare 2 longs and CMOVE doubles
// CMOVE an XMM double on LT/GE long-compare flags (SSE2 path, UseSSE>=2).
instruct cmovXDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regXD dst, regXD src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE>=2 guard applies to
  // both the 'lt' and 'ge' terms ('&&' binds tighter than '||').
  predicate( UseSSE>=2 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ) );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovXD_regS(cmp,flags,dst,src);
  %}
%}
13241 
// CMOVE an FPU float on LT/GE long-compare flags (x87 path, UseSSE==0).
instruct cmovFF_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regF dst, regF src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE==0 guard applies to
  // both the 'lt' and 'ge' terms ('&&' binds tighter than '||').
  predicate( UseSSE==0 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ) );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}
13250 
// CMOVE an XMM float on LT/GE long-compare flags (SSE path, UseSSE>=1).
instruct cmovXX_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regX dst, regX src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE>=1 guard applies to
  // both the 'lt' and 'ge' terms ('&&' binds tighter than '||').
  predicate( UseSSE>=1 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ) );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovX_regS(cmp,flags,dst,src);
  %}
%}
13259 
13260 //======
13261 // Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
// Long-vs-zero compare for EQ/NE: OR the two halves into a temp; the Z flag
// is set iff the whole long is zero.
instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, eRegI tmp ) %{
  match( Set flags (CmpL src zero ));
  effect(TEMP tmp);
  ins_cost(200);
  format %{ "MOV    $tmp,$src.lo\n\t"
            "OR     $tmp,$src.hi\t! Long is EQ/NE 0?" %}
  ins_encode( long_cmp_flags0( src, tmp ) );
  ins_pipe( ialu_reg_reg_long );
%}
13271 
13272 // Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
// Long reg-reg compare for EQ/NE: compare low halves, and only if they are
// equal compare the high halves (short forward branch over the second CMP).
instruct cmpL_reg_flags_EQNE( flagsReg_long_EQNE flags, eRegL src1, eRegL src2 ) %{
  match( Set flags (CmpL src1 src2 ));
  ins_cost(200+300);
  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
            "JNE,s  skip\n\t"
            "CMP    $src1.hi,$src2.hi\n\t"
     "skip:\t" %}
  ins_encode( long_cmp_flags1( src1, src2 ) );
  ins_pipe( ialu_cr_reg_reg );
%}
13283 
13284 // Long compare reg == zero/reg OR reg != zero/reg
13285 // Just a wrapper for a normal branch, plus the predicate test.
// Branch on a long EQ/NE compare: predicate restricts to eq/ne tests, then
// expands to a plain jmpCon.
instruct cmpL_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
  expand %{
    jmpCon(cmp,flags,labl);    // JEQ or JNE...
  %}
%}
13294 
13295 // Compare 2 longs and CMOVE longs.
// CMOVE a long on EQ/NE long-compare flags: one CMOV per 32-bit half.
instruct cmovLL_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}
13306 
// CMOVE a long from memory on EQ/NE long-compare flags.
instruct cmovLL_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
  ins_pipe( pipe_cmov_reg_long );
%}
13317 
13318 // Compare 2 longs and CMOVE ints.
// CMOVE an int on EQ/NE long-compare flags.
instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, eRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}
13328 
// CMOVE an int from memory on EQ/NE long-compare flags.
instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}
13338 
// Compare 2 longs and CMOVE ptrs.
// CMOVE a pointer on EQ/NE long-compare flags.
instruct cmovPP_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}
13349 
13350 // Compare 2 longs and CMOVE doubles
// CMOVE an FPU double on EQ/NE long-compare flags (x87 path, UseSSE<=1).
instruct cmovDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regD dst, regD src) %{
  // Parenthesize the BoolTest disjunction: '&&' binds tighter than '||', so
  // without parens the UseSSE<=1 guard only covered the 'eq' term and this
  // rule could match an 'ne' test regardless of UseSSE.
  predicate( UseSSE<=1 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ) );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}
13359 
13360 // Compare 2 longs and CMOVE doubles
// CMOVE an XMM double on EQ/NE long-compare flags (SSE2 path, UseSSE>=2).
instruct cmovXDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regXD dst, regXD src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE>=2 guard applies to
  // both the 'eq' and 'ne' terms ('&&' binds tighter than '||').
  predicate( UseSSE>=2 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ) );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovXD_regS(cmp,flags,dst,src);
  %}
%}
13369 
// CMOVE an FPU float on EQ/NE long-compare flags (x87 path, UseSSE==0).
instruct cmovFF_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regF dst, regF src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE==0 guard applies to
  // both the 'eq' and 'ne' terms ('&&' binds tighter than '||').
  predicate( UseSSE==0 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ) );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}
13378 
// CMOVE an XMM float on EQ/NE long-compare flags (SSE path, UseSSE>=1).
instruct cmovXX_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regX dst, regX src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE>=1 guard applies to
  // both the 'eq' and 'ne' terms ('&&' binds tighter than '||').
  predicate( UseSSE>=1 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ) );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovX_regS(cmp,flags,dst,src);
  %}
%}
13387 
13388 //======
13389 // Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
13390 // Same as cmpL_reg_flags_LEGT except must negate src
// Long-vs-zero compare for LE/GT: compute 0 - src via CMP/SBB through a
// temp so the commuted (negated) test can be used.
instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, eRegI tmp ) %{
  match( Set flags (CmpL src zero ));
  effect( TEMP tmp );
  ins_cost(300);
  format %{ "XOR    $tmp,$tmp\t# Long compare for -$src < 0, use commuted test\n\t"
            "CMP    $tmp,$src.lo\n\t"
            "SBB    $tmp,$src.hi\n\t" %}
  ins_encode( long_cmp_flags3(src, tmp) );
  ins_pipe( ialu_reg_reg_long );
%}
13401 
13402 // Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
13403 // Same as cmpL_reg_flags_LTGE except operands swapped.  Swapping operands
13404 // requires a commuted test to get the same result.
// Long reg-reg compare for LE/GT: same CMP/SBB sequence as the LTGE version
// but with src1/src2 swapped, to be paired with a commuted condition.
instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, eRegI tmp ) %{
  match( Set flags (CmpL src1 src2 ));
  effect( TEMP tmp );
  ins_cost(300);
  format %{ "CMP    $src2.lo,$src1.lo\t! Long compare, swapped operands, use with commuted test\n\t"
            "MOV    $tmp,$src2.hi\n\t"
            "SBB    $tmp,$src1.hi\t! Compute flags for long compare" %}
  ins_encode( long_cmp_flags2( src2, src1, tmp ) );
  ins_pipe( ialu_cr_reg_reg );
%}
13415 
13416 // Long compares reg < zero/req OR reg >= zero/req.
13417 // Just a wrapper for a normal branch, plus the predicate test
// Branch on a long LE/GT compare: predicate restricts to gt/le tests; the
// cmpOp_commute operand supplies the commuted condition for the swapped
// compare above.
instruct cmpL_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le );
  ins_cost(300);
  expand %{
    jmpCon(cmp,flags,labl);    // JGT or JLE...
  %}
%}
13427 
13428 // Compare 2 longs and CMOVE longs.
// CMOVE a long on LE/GT long-compare flags (commuted condition).
instruct cmovLL_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}
13439 
// CMOVE a long from memory on LE/GT long-compare flags (commuted condition).
instruct cmovLL_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi+4" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
  ins_pipe( pipe_cmov_reg_long );
%}
13450 
13451 // Compare 2 longs and CMOVE ints.
// CMOVE an int on LE/GT long-compare flags (commuted condition).
instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, eRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}
13461 
// CMOVE an int from memory on LE/GT long-compare flags (commuted condition).
instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}
13471 
13472 // Compare 2 longs and CMOVE ptrs.
// CMOVE a pointer on LE/GT long-compare flags (commuted condition).
instruct cmovPP_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}
13482 
13483 // Compare 2 longs and CMOVE doubles
// CMOVE an FPU double on LE/GT long-compare flags (x87 path, UseSSE<=1).
instruct cmovDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regD dst, regD src) %{
  // Parenthesize the BoolTest disjunction: '&&' binds tighter than '||', so
  // without parens the UseSSE<=1 guard only covered the 'le' term and this
  // rule could match a 'gt' test regardless of UseSSE.
  predicate( UseSSE<=1 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ) );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}
13492 
13493 // Compare 2 longs and CMOVE doubles
// CMOVE an XMM double on LE/GT long-compare flags (SSE2 path, UseSSE>=2).
instruct cmovXDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regXD dst, regXD src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE>=2 guard applies to
  // both the 'le' and 'gt' terms ('&&' binds tighter than '||').
  predicate( UseSSE>=2 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ) );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovXD_regS(cmp,flags,dst,src);
  %}
%}
13502 
// CMOVE an FPU float on LE/GT long-compare flags (x87 path, UseSSE==0).
instruct cmovFF_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regF dst, regF src) %{
  // Parenthesize the BoolTest disjunction so the UseSSE==0 guard applies to
  // both the 'le' and 'gt' terms ('&&' binds tighter than '||').
  predicate( UseSSE==0 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ) );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}
13511 
13512 
// Compare 2 longs and CMOVE floats (XMM form, UseSSE>=1), for LE/GT
// tests of a long compare.
// FIX: parenthesize the BoolTest disjunction.  '&&' binds tighter than
// '||', so the predicate previously parsed as (UseSSE>=1 && le) || gt,
// letting this XMM rule match a GT test even on non-SSE hardware.  The
// grouping now matches cmovPP_reg_LEGT above.
13513 instruct cmovXX_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regX dst, regX src) %{
13514   predicate( UseSSE>=1 && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ) );
13515   match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
13516   ins_cost(200);
13517   expand %{
13518     fcmovX_regS(cmp,flags,dst,src);
13519   %}
13520 %}
13521 
13522 
13523 // ============================================================================
13524 // Procedure Call/Return Instructions
13525 // Call Java Static Instruction
13526 // Note: If this code changes, the corresponding ret_addr_offset() and
13527 //       compute_padding() functions will have to be adjusted.
13528 instruct CallStaticJavaDirect(method meth) %{
13529   match(CallStaticJava);
      // Method-handle invokes are excluded here; they need SP bookkeeping
      // and are matched by CallStaticJavaHandle instead.
13530   predicate(! ((CallStaticJavaNode*)n)->is_method_handle_invoke());
13531   effect(USE meth);
13532 
13533   ins_cost(300);
13534   format %{ "CALL,static " %}
13535   opcode(0xE8); /* E8 cd */
      // pre/post_call_FPU bracket the call — presumably to keep x87 FPU
      // state consistent across the call boundary; see their encode defs.
13536   ins_encode( pre_call_FPU,
13537               Java_Static_Call( meth ),
13538               call_epilog,
13539               post_call_FPU );
13540   ins_pipe( pipe_slow );
13541   ins_pc_relative(1);
13542   ins_alignment(4);
13543 %}
13544 
13545 // Call Java Static Instruction (method handle version)
13546 // Note: If this code changes, the corresponding ret_addr_offset() and
13547 //       compute_padding() functions will have to be adjusted.
13548 instruct CallStaticJavaHandle(method meth, eBPRegP ebp) %{
13549   match(CallStaticJava);
13550   predicate(((CallStaticJavaNode*)n)->is_method_handle_invoke());
13551   effect(USE meth);
13552   // EBP is saved by all callees (for interpreter stack correction).
13553   // We use it here for a similar purpose, in {preserve,restore}_SP.
13554 
13555   ins_cost(300);
13556   format %{ "CALL,static/MethodHandle " %}
13557   opcode(0xE8); /* E8 cd */
      // Differs from CallStaticJavaDirect by wrapping the call in
      // preserve_SP/restore_SP — presumably because method-handle calls
      // may adjust SP; confirm against the encode definitions.
13558   ins_encode( pre_call_FPU,
13559               preserve_SP,
13560               Java_Static_Call( meth ),
13561               restore_SP,
13562               call_epilog,
13563               post_call_FPU );
13564   ins_pipe( pipe_slow );
13565   ins_pc_relative(1);
13566   ins_alignment(4);
13567 %}
13568 
13569 // Call Java Dynamic Instruction
13570 // Note: If this code changes, the corresponding ret_addr_offset() and
13571 //       compute_padding() functions will have to be adjusted.
13572 instruct CallDynamicJavaDirect(method meth) %{
13573   match(CallDynamicJava);
13574   effect(USE meth);
13575 
13576   ins_cost(300);
      // Per the format, EAX is loaded with a sentinel oop (-1) before the
      // call — presumably for the inline-cache check; confirm against the
      // Java_Dynamic_Call encode definition.
13577   format %{ "MOV    EAX,(oop)-1\n\t"
13578             "CALL,dynamic" %}
13579   opcode(0xE8); /* E8 cd */
13580   ins_encode( pre_call_FPU,
13581               Java_Dynamic_Call( meth ),
13582               call_epilog,
13583               post_call_FPU );
13584   ins_pipe( pipe_slow );
13585   ins_pc_relative(1);
13586   ins_alignment(4);
13587 %}
13588 
13589 // Call Runtime Instruction
// Direct call into the runtime (may safepoint, unlike the leaf calls
// below); the x87 stack is emptied before the call.
13590 instruct CallRuntimeDirect(method meth) %{
13591   match(CallRuntime );
13592   effect(USE meth);
13593 
13594   ins_cost(300);
13595   format %{ "CALL,runtime " %}
13596   opcode(0xE8); /* E8 cd */
13597   // Use FFREEs to clear entries in float stack
13598   ins_encode( pre_call_FPU,
13599               FFree_Float_Stack_All,
13600               Java_To_Runtime( meth ),
13601               post_call_FPU );
13602   ins_pipe( pipe_slow );
13603   ins_pc_relative(1);
13604 %}
13605 
13606 // Call runtime without safepoint
// Leaf runtime call: like CallRuntimeDirect but additionally runs
// Verify_FPU_For_Leaf after the call (no call_epilog here — leaf calls
// do not safepoint, per the comment above).
13607 instruct CallLeafDirect(method meth) %{
13608   match(CallLeaf);
13609   effect(USE meth);
13610 
13611   ins_cost(300);
13612   format %{ "CALL_LEAF,runtime " %}
13613   opcode(0xE8); /* E8 cd */
13614   ins_encode( pre_call_FPU,
13615               FFree_Float_Stack_All,
13616               Java_To_Runtime( meth ),
13617               Verify_FPU_For_Leaf, post_call_FPU );
13618   ins_pipe( pipe_slow );
13619   ins_pc_relative(1);
13620 %}
13621 
// Leaf runtime call that touches no floating point: note the encoding is
// just the call itself, with none of the FPU bracketing the other call
// rules use.
13622 instruct CallLeafNoFPDirect(method meth) %{
13623   match(CallLeafNoFP);
13624   effect(USE meth);
13625 
13626   ins_cost(300);
13627   format %{ "CALL_LEAF_NOFP,runtime " %}
13628   opcode(0xE8); /* E8 cd */
13629   ins_encode(Java_To_Runtime(meth));
13630   ins_pipe( pipe_slow );
13631   ins_pc_relative(1);
13632 %}
13633 
13634 
13635 // Return Instruction
13636 // Remove the return address & jump to it.
13637 instruct Ret() %{
13638   match(Return);
13639   format %{ "RET" %}
      // Single-byte near return; OpcP emits just the opcode byte.
13640   opcode(0xC3);
13641   ins_encode(OpcP);
13642   ins_pipe( pipe_jmp );
13643 %}
13644 
13645 // Tail Call; Jump from runtime stub to Java code.
13646 // Also known as an 'interprocedural jump'.
13647 // Target of jump will eventually return to caller.
13648 // TailJump below removes the return address.
13649 instruct TailCalljmpInd(eRegP_no_EBP jump_target, eBXRegP method_oop) %{
      // method_oop is pinned to EBX (eBXRegP) so the callee finds the
      // method oop there, as the format string notes.
13650   match(TailCall jump_target method_oop );
13651   ins_cost(300);
13652   format %{ "JMP    $jump_target \t# EBX holds method oop" %}
      // FF /4 is the indirect near JMP; the /4 goes in the ModRM reg field
      // via RegOpc.
13653   opcode(0xFF, 0x4);  /* Opcode FF /4 */
13654   ins_encode( OpcP, RegOpc(jump_target) );
13655   ins_pipe( pipe_jmp );
13656 %}
13657 
13658 
13659 // Tail Jump; remove the return address; jump to target.
13660 // TailCall above leaves the return address around.
13661 instruct tailjmpInd(eRegP_no_EBP jump_target, eAXRegP ex_oop) %{
      // ex_oop is pinned to EAX (eAXRegP); the return address is discarded
      // into EDX (enc_pop_rdx) before the indirect jump.
13662   match( TailJump jump_target ex_oop );
13663   ins_cost(300);
13664   format %{ "POP    EDX\t# pop return address into dummy\n\t"
13665             "JMP    $jump_target " %}
13666   opcode(0xFF, 0x4);  /* Opcode FF /4 */
13667   ins_encode( enc_pop_rdx,
13668               OpcP, RegOpc(jump_target) );
13669   ins_pipe( pipe_jmp );
13670 %}
13671 
13672 // Create exception oop: created by stack-crawling runtime code.
13673 // Created exception is now available to this handler, and is setup
13674 // just prior to jumping to this handler.  No code emitted.
13675 instruct CreateException( eAXRegP ex_oop )
13676 %{
13677   match(Set ex_oop (CreateEx));
13678 
      // size(0): this rule emits no instructions — it only tells the
      // register allocator the exception oop arrives in EAX.
13679   size(0);
13680   // use the following format syntax
13681   format %{ "# exception oop is in EAX; no code emitted" %}
13682   ins_encode();
13683   ins_pipe( empty );
13684 %}
13685 
13686 
13687 // Rethrow exception:
13688 // The exception oop will come in the first argument position.
13689 // Then JUMP (not call) to the rethrow stub code.
13690 instruct RethrowException()
13691 %{
13692   match(Rethrow);
13693 
      // Jumps (does not call) into the rethrow stub, per the comment above
      // this rule; encoding is entirely in enc_rethrow.
13694   // use the following format syntax
13695   format %{ "JMP    rethrow_stub" %}
13696   ins_encode(enc_rethrow);
13697   ins_pipe( pipe_jmp );
13698 %}
13699 
13700 // inlined locking and unlocking
13701 
13702 
// Inlined monitor enter: sets the flags register from the lock attempt.
// tmp is pinned to EAX (eAXRegI); tmp and scr are TEMPs and therefore
// clobbered, as the format string warns.
13703 instruct cmpFastLock( eFlagsReg cr, eRegP object, eRegP box, eAXRegI tmp, eRegP scr) %{
13704   match( Set cr (FastLock object box) );
13705   effect( TEMP tmp, TEMP scr );
13706   ins_cost(300);
13707   format %{ "FASTLOCK $object, $box KILLS $tmp,$scr" %}
13708   ins_encode( Fast_Lock(object,box,tmp,scr) );
13709   ins_pipe( pipe_slow );
13710   ins_pc_relative(1);
13711 %}
13712 
// Inlined monitor exit: sets the flags register from the unlock attempt.
// box is pinned to EAX (eAXRegP); tmp is a TEMP and is clobbered.
13713 instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
13714   match( Set cr (FastUnlock object box) );
13715   effect( TEMP tmp );
13716   ins_cost(300);
13717   format %{ "FASTUNLOCK $object, $box, $tmp" %}
13718   ins_encode( Fast_Unlock(object,box,tmp) );
13719   ins_pipe( pipe_slow );
13720   ins_pc_relative(1);
13721 %}
13722 
13723 
13724 
13725 // ============================================================================
13726 // Safepoint Instruction
13727 instruct safePoint_poll(eFlagsReg cr) %{
13728   match(SafePoint);
      // The poll (a TEST against the polling page, per the format) sets
      // EFLAGS, hence KILL cr.
13729   effect(KILL cr);
13730 
13731   // TODO-FIXME: we currently poll at offset 0 of the safepoint polling page.
13732   // On SPARC that might be acceptable as we can generate the address with
13733   // just a sethi, saving an or.  By polling at offset 0 we can end up
13734   // putting additional pressure on the index-0 in the D$.  Because of
13735   // alignment (just like the situation at hand) the lower indices tend
13736   // to see more traffic.  It'd be better to change the polling address
13737   // to offset 0 of the last $line in the polling page.
13738 
13739   format %{ "TSTL   #polladdr,EAX\t! Safepoint: poll for GC" %}
13740   ins_cost(125);
      // Fixed 6-byte encoding — presumably relied on by code that patches
      // or measures the poll site; confirm before changing.
13741   size(6) ;
13742   ins_encode( Safepoint_Poll() );
13743   ins_pipe( ialu_reg_mem );
13744 %}
13745 
13746 //----------PEEPHOLE RULES-----------------------------------------------------
13747 // These must follow all instruction definitions as they use the names
13748 // defined in the instructions definitions.
13749 //
13750 // peepmatch ( root_instr_name [preceding_instruction]* );
13751 //
13752 // peepconstraint %{
13753 // (instruction_number.operand_name relational_op instruction_number.operand_name
13754 //  [, ...] );
13755 // // instruction numbers are zero-based using left to right order in peepmatch
13756 //
13757 // peepreplace ( instr_name  ( [instruction_number.operand_name]* ) );
13758 // // provide an instruction_number.operand_name for each operand that appears
13759 // // in the replacement instruction's match rule
13760 //
13761 // ---------VM FLAGS---------------------------------------------------------
13762 //
13763 // All peephole optimizations can be turned off using -XX:-OptoPeephole
13764 //
13765 // Each peephole rule is given an identifying number starting with zero and
13766 // increasing by one in the order seen by the parser.  An individual peephole
13767 // can be enabled, and all others disabled, by using -XX:OptoPeepholeAt=#
13768 // on the command-line.
13769 //
13770 // ---------CURRENT LIMITATIONS----------------------------------------------
13771 //
13772 // Only match adjacent instructions in same basic block
13773 // Only equality constraints
13774 // Only constraints between operands, not (0.dest_reg == EAX_enc)
13775 // Only one replacement instruction
13776 //
13777 // ---------EXAMPLE----------------------------------------------------------
13778 //
13779 // // pertinent parts of existing instructions in architecture description
13780 // instruct movI(eRegI dst, eRegI src) %{
13781 //   match(Set dst (CopyI src));
13782 // %}
13783 //
13784 // instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
13785 //   match(Set dst (AddI dst src));
13786 //   effect(KILL cr);
13787 // %}
13788 //
13789 // // Change (inc mov) to lea
13790 // peephole %{
13791 //   // increment preceded by register-register move
13792 //   peepmatch ( incI_eReg movI );
13793 //   // require that the destination register of the increment
13794 //   // match the destination register of the move
13795 //   peepconstraint ( 0.dst == 1.dst );
13796 //   // construct a replacement instruction that sets
13797 //   // the destination to ( move's source register + one )
13798 //   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
13799 // %}
13800 //
13801 // Implementation no longer uses movX instructions since
13802 // machine-independent system no longer uses CopyX nodes.
13803 //
13804 // peephole %{
13805 //   peepmatch ( incI_eReg movI );
13806 //   peepconstraint ( 0.dst == 1.dst );
13807 //   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
13808 // %}
13809 //
13810 // peephole %{
13811 //   peepmatch ( decI_eReg movI );
13812 //   peepconstraint ( 0.dst == 1.dst );
13813 //   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
13814 // %}
13815 //
13816 // peephole %{
13817 //   peepmatch ( addI_eReg_imm movI );
13818 //   peepconstraint ( 0.dst == 1.dst );
13819 //   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
13820 // %}
13821 //
13822 // peephole %{
13823 //   peepmatch ( addP_eReg_imm movP );
13824 //   peepconstraint ( 0.dst == 1.dst );
13825 //   peepreplace ( leaP_eReg_immI( 0.dst 1.src 0.src ) );
13826 // %}
13827 
13828 // // Change load of spilled value to only a spill
13829 // instruct storeI(memory mem, eRegI src) %{
13830 //   match(Set mem (StoreI mem src));
13831 // %}
13832 //
13833 // instruct loadI(eRegI dst, memory mem) %{
13834 //   match(Set dst (LoadI mem));
13835 // %}
13836 //
// Peephole: a storeI (instruction 1, the preceding instruction) followed
// by a loadI (instruction 0, the root) of the same memory slot into the
// same register that was just stored is replaced by the store alone —
// the value is already live in that register, so the reload is redundant.
13837 peephole %{
13838   peepmatch ( loadI storeI );
13839   peepconstraint ( 1.src == 0.dst, 1.mem == 0.mem );
13840   peepreplace ( storeI( 1.mem 1.mem 1.src ) );
13841 %}
13842 
13843 //----------SMARTSPILL RULES---------------------------------------------------
13844 // These must follow all instruction definitions as they use the names
13845 // defined in the instructions definitions.