/*
 * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
  24 
  25 #include "precompiled.hpp"
  26 #include "assembler_x86.inline.hpp"
  27 #include "gc_interface/collectedHeap.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "memory/cardTableModRefBS.hpp"
  30 #include "memory/resourceArea.hpp"
  31 #include "prims/methodHandles.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/interfaceSupport.hpp"
  34 #include "runtime/objectMonitor.hpp"
  35 #include "runtime/os.hpp"
  36 #include "runtime/sharedRuntime.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #ifndef SERIALGC
  39 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  40 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  41 #include "gc_implementation/g1/heapRegion.hpp"
  42 #endif
  43 
  44 // Implementation of AddressLiteral
  45 
  46 AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  47   _is_lval = false;
  48   _target = target;
  49   switch (rtype) {
  50   case relocInfo::oop_type:
  51     // Oops are a special case. Normally they would be their own section
  52     // but in cases like icBuffer they are literals in the code stream that
  53     // we don't have a section for. We use none so that we get a literal address
  54     // which is always patchable.
  55     break;
  56   case relocInfo::external_word_type:
  57     _rspec = external_word_Relocation::spec(target);
  58     break;
  59   case relocInfo::internal_word_type:
  60     _rspec = internal_word_Relocation::spec(target);
  61     break;
  62   case relocInfo::opt_virtual_call_type:
  63     _rspec = opt_virtual_call_Relocation::spec();
  64     break;
  65   case relocInfo::static_call_type:
  66     _rspec = static_call_Relocation::spec();
  67     break;
  68   case relocInfo::runtime_call_type:
  69     _rspec = runtime_call_Relocation::spec();
  70     break;
  71   case relocInfo::poll_type:
  72   case relocInfo::poll_return_type:
  73     _rspec = Relocation::spec_simple(rtype);
  74     break;
  75   case relocInfo::none:
  76     break;
  77   default:
  78     ShouldNotReachHere();
  79     break;
  80   }
  81 }
  82 
  83 // Implementation of Address
  84 
  85 #ifdef _LP64
  86 
  87 Address Address::make_array(ArrayAddress adr) {
  88   // Not implementable on 64bit machines
  89   // Should have been handled higher up the call chain.
  90   ShouldNotReachHere();
  91   return Address();
  92 }
  93 
  94 // exceedingly dangerous constructor
  95 Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  96   _base  = noreg;
  97   _index = noreg;
  98   _scale = no_scale;
  99   _disp  = disp;
 100   switch (rtype) {
 101     case relocInfo::external_word_type:
 102       _rspec = external_word_Relocation::spec(loc);
 103       break;
 104     case relocInfo::internal_word_type:
 105       _rspec = internal_word_Relocation::spec(loc);
 106       break;
 107     case relocInfo::runtime_call_type:
 108       // HMM
 109       _rspec = runtime_call_Relocation::spec();
 110       break;
 111     case relocInfo::poll_type:
 112     case relocInfo::poll_return_type:
 113       _rspec = Relocation::spec_simple(rtype);
 114       break;
 115     case relocInfo::none:
 116       break;
 117     default:
 118       ShouldNotReachHere();
 119   }
 120 }
 121 #else // LP64
 122 
 123 Address Address::make_array(ArrayAddress adr) {
 124   AddressLiteral base = adr.base();
 125   Address index = adr.index();
 126   assert(index._disp == 0, "must not have disp"); // maybe it can?
 127   Address array(index._base, index._index, index._scale, (intptr_t) base.target());
 128   array._rspec = base._rspec;
 129   return array;
 130 }
 131 
 132 // exceedingly dangerous constructor
 133 Address::Address(address loc, RelocationHolder spec) {
 134   _base  = noreg;
 135   _index = noreg;
 136   _scale = no_scale;
 137   _disp  = (intptr_t) loc;
 138   _rspec = spec;
 139 }
 140 
 141 #endif // _LP64
 142 
 143 
 144 
 145 // Convert the raw encoding form into the form expected by the constructor for
 146 // Address.  An index of 4 (rsp) corresponds to having no index, so convert
 147 // that to noreg for the Address constructor.
 148 Address Address::make_raw(int base, int index, int scale, int disp, bool disp_is_oop) {
 149   RelocationHolder rspec;
 150   if (disp_is_oop) {
 151     rspec = Relocation::spec_simple(relocInfo::oop_type);
 152   }
 153   bool valid_index = index != rsp->encoding();
 154   if (valid_index) {
 155     Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
 156     madr._rspec = rspec;
 157     return madr;
 158   } else {
 159     Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
 160     madr._rspec = rspec;
 161     return madr;
 162   }
 163 }
 164 
 165 // Implementation of Assembler
 166 
 167 int AbstractAssembler::code_fill_byte() {
 168   return (u_char)'\xF4'; // hlt
 169 }
 170 
 171 // make this go away someday
 172 void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
 173   if (rtype == relocInfo::none)
 174         emit_long(data);
 175   else  emit_data(data, Relocation::spec_simple(rtype), format);
 176 }
 177 
 178 void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
 179   assert(imm_operand == 0, "default format must be immediate in this file");
 180   assert(inst_mark() != NULL, "must be inside InstructionMark");
 181   if (rspec.type() !=  relocInfo::none) {
 182     #ifdef ASSERT
 183       check_relocation(rspec, format);
 184     #endif
 185     // Do not use AbstractAssembler::relocate, which is not intended for
 186     // embedded words.  Instead, relocate to the enclosing instruction.
 187 
 188     // hack. call32 is too wide for mask so use disp32
 189     if (format == call32_operand)
 190       code_section()->relocate(inst_mark(), rspec, disp32_operand);
 191     else
 192       code_section()->relocate(inst_mark(), rspec, format);
 193   }
 194   emit_long(data);
 195 }
 196 
 197 static int encode(Register r) {
 198   int enc = r->encoding();
 199   if (enc >= 8) {
 200     enc -= 8;
 201   }
 202   return enc;
 203 }
 204 
 205 static int encode(XMMRegister r) {
 206   int enc = r->encoding();
 207   if (enc >= 8) {
 208     enc -= 8;
 209   }
 210   return enc;
 211 }
 212 
// Emit a byte-sized arithmetic instruction with an imm8: opcode byte,
// ModRM byte (register-direct form, dst in the r/m field), then the
// immediate.  op1's low bit clear marks the 8-bit form of the opcode.
void Assembler::emit_arith_b(int op1, int op2, Register dst, int imm8) {
  assert(dst->has_byte_register(), "must have byte register");
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert(isByte(imm8), "not a byte");
  assert((op1 & 0x01) == 0, "should be 8bit operation");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_byte(imm8);
}
 222 
 223 
 224 void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
 225   assert(isByte(op1) && isByte(op2), "wrong opcode");
 226   assert((op1 & 0x01) == 1, "should be 32bit operation");
 227   assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
 228   if (is8bit(imm32)) {
 229     emit_byte(op1 | 0x02); // set sign bit
 230     emit_byte(op2 | encode(dst));
 231     emit_byte(imm32 & 0xFF);
 232   } else {
 233     emit_byte(op1);
 234     emit_byte(op2 | encode(dst));
 235     emit_long(imm32);
 236   }
 237 }
 238 
 239 // immediate-to-memory forms
 240 void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
 241   assert((op1 & 0x01) == 1, "should be 32bit operation");
 242   assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
 243   if (is8bit(imm32)) {
 244     emit_byte(op1 | 0x02); // set sign bit
 245     emit_operand(rm, adr, 1);
 246     emit_byte(imm32 & 0xFF);
 247   } else {
 248     emit_byte(op1);
 249     emit_operand(rm, adr, 4);
 250     emit_long(imm32);
 251   }
 252 }
 253 
// 32-bit only: arithmetic with an embedded oop immediate.  The oop word is
// emitted through emit_data with an oop relocation so the GC can find and
// update it; InstructionMark anchors that relocation to this instruction.
void Assembler::emit_arith(int op1, int op2, Register dst, jobject obj) {
  LP64_ONLY(ShouldNotReachHere());
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  InstructionMark im(this);
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_data((intptr_t)obj, relocInfo::oop_type, 0);
}
 264 
 265 
 266 void Assembler::emit_arith(int op1, int op2, Register dst, Register src) {
 267   assert(isByte(op1) && isByte(op2), "wrong opcode");
 268   emit_byte(op1);
 269   emit_byte(op2 | encode(dst) << 3 | encode(src));
 270 }
 271 
 272 
// Emit the ModRM byte, optional SIB byte, and displacement for a memory
// operand [base + index*scale + disp], with 'reg' placed in the ModRM reg
// field.  Any relocated displacement forces the disp32 form so a patchable
// word is always present.  rip_relative_correction accounts for immediate
// bytes that follow the operand when forming a RIP-relative displacement
// (64-bit only).
void Assembler::emit_operand(Register reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec,
                             int rip_relative_correction) {
  relocInfo::relocType rtype = (relocInfo::relocType) rspec.type();

  // Encode the registers as needed in the fields they are used in

  int regenc = encode(reg) << 3;
  int indexenc = index->is_valid() ? encode(index) << 3 : 0;
  int baseenc = base->is_valid() ? encode(base) : 0;

  if (base->is_valid()) {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [base + index*scale + disp]
      // rbp/r13 cannot be a base with mod=00 (that encoding means
      // disp32-only), hence the extra exclusion below.
      if (disp == 0 && rtype == relocInfo::none  &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base + index*scale]
        // [00 reg 100][ss index base]
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x04 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + index*scale + imm8]
        // [01 reg 100][ss index base] imm8
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x44 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + index*scale + disp32]
        // [10 reg 100][ss index base] disp32
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x84 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    } else if (base == rsp LP64_ONLY(|| base == r12)) {
      // [rsp + disp]
      // rsp/r12 as base always needs a SIB byte: their low encoding (b100)
      // is the SIB escape in the ModRM r/m field.
      if (disp == 0 && rtype == relocInfo::none) {
        // [rsp]
        // [00 reg 100][00 100 100]
        emit_byte(0x04 | regenc);
        emit_byte(0x24);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [rsp + imm8]
        // [01 reg 100][00 100 100] disp8
        emit_byte(0x44 | regenc);
        emit_byte(0x24);
        emit_byte(disp & 0xFF);
      } else {
        // [rsp + imm32]
        // [10 reg 100][00 100 100] disp32
        emit_byte(0x84 | regenc);
        emit_byte(0x24);
        emit_data(disp, rspec, disp32_operand);
      }
    } else {
      // [base + disp]
      assert(base != rsp LP64_ONLY(&& base != r12), "illegal addressing mode");
      if (disp == 0 && rtype == relocInfo::none &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base]
        // [00 reg base]
        emit_byte(0x00 | regenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + disp8]
        // [01 reg base] disp8
        emit_byte(0x40 | regenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + disp32]
        // [10 reg base] disp32
        emit_byte(0x80 | regenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    }
  } else {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [index*scale + disp]
      // [00 reg 100][ss index 101] disp32
      assert(index != rsp, "illegal addressing mode");
      emit_byte(0x04 | regenc);
      emit_byte(scale << 6 | indexenc | 0x05);
      emit_data(disp, rspec, disp32_operand);
    } else if (rtype != relocInfo::none ) {
      // [disp] (64bit) RIP-RELATIVE (32bit) abs
      // [00 000 101] disp32

      emit_byte(0x05 | regenc);
      // Note that the RIP-rel. correction applies to the generated
      // disp field, but _not_ to the target address in the rspec.

      // disp was created by converting the target address minus the pc
      // at the start of the instruction. That needs more correction here.
      // intptr_t disp = target - next_ip;
      assert(inst_mark() != NULL, "must be inside InstructionMark");
      address next_ip = pc() + sizeof(int32_t) + rip_relative_correction;
      int64_t adjusted = disp;
      // Do rip-rel adjustment for 64bit
      LP64_ONLY(adjusted -=  (next_ip - inst_mark()));
      assert(is_simm32(adjusted),
             "must be 32bit offset (RIP relative address)");
      emit_data((int32_t) adjusted, rspec, disp32_operand);

    } else {
      // 32bit never did this, did everything as the rip-rel/disp code above
      // [disp] ABSOLUTE
      // [00 reg 100][00 100 101] disp32
      emit_byte(0x04 | regenc);
      emit_byte(0x25);
      emit_data(disp, rspec, disp32_operand);
    }
  }
}
 390 
 391 void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
 392                              Address::ScaleFactor scale, int disp,
 393                              RelocationHolder const& rspec) {
 394   emit_operand((Register)reg, base, index, scale, disp, rspec);
 395 }
 396 
 397 // Secret local extension to Assembler::WhichOperand:
 398 #define end_pc_operand (_WhichOperand_limit)
 399 
// Hand-written partial disassembler: decodes one x86 instruction starting
// at 'inst' just far enough to find the 32-bit operand selected by 'which'
// (or, for the secret end_pc_operand, the address just past the whole
// instruction).  It must recognize every instruction shape this assembler
// emits with an embedded word; anything else hits ShouldNotReachHere().
address Assembler::locate_operand(address inst, WhichOperand which) {
  // Decode the given instruction, and return the address of
  // an embedded 32-bit operand word.

  // If "which" is disp32_operand, selects the displacement portion
  // of an effective address specifier.
  // If "which" is imm64_operand, selects the trailing immediate constant.
  // If "which" is call32_operand, selects the displacement of a call or jump.
  // Caller is responsible for ensuring that there is such an operand,
  // and that it is 32/64 bits wide.

  // If "which" is end_pc_operand, find the end of the instruction.

  address ip = inst;
  bool is_64bit = false;

  debug_only(bool has_disp32 = false);
  int tail_size = 0; // other random bytes (#32, #16, etc.) at end of insn

  again_after_prefix:
  switch (0xFF & *ip++) {

  // These convenience macros generate groups of "case" labels for the switch.
#define REP4(x) (x)+0: case (x)+1: case (x)+2: case (x)+3
#define REP8(x) (x)+0: case (x)+1: case (x)+2: case (x)+3: \
             case (x)+4: case (x)+5: case (x)+6: case (x)+7
#define REP16(x) REP8((x)+0): \
              case REP8((x)+8)

  case CS_segment:
  case SS_segment:
  case DS_segment:
  case ES_segment:
  case FS_segment:
  case GS_segment:
    // Seems dubious
    LP64_ONLY(assert(false, "shouldn't have that prefix"));
    assert(ip == inst+1, "only one prefix allowed");
    goto again_after_prefix;

  // 0x67 (address-size override) is grouped with the non-W REX prefixes.
  case 0x67:
  case REX:
  case REX_B:
  case REX_X:
  case REX_XB:
  case REX_R:
  case REX_RB:
  case REX_RX:
  case REX_RXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    goto again_after_prefix;

  case REX_W:
  case REX_WB:
  case REX_WX:
  case REX_WXB:
  case REX_WR:
  case REX_WRB:
  case REX_WRX:
  case REX_WRXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    // REX.W widens the operand (e.g. an 8-byte immediate for movq).
    is_64bit = true;
    goto again_after_prefix;

  case 0xFF: // pushq a; decl a; incl a; call a; jmp a
  case 0x88: // movb a, r
  case 0x89: // movl a, r
  case 0x8A: // movb r, a
  case 0x8B: // movl r, a
  case 0x8F: // popl a
    debug_only(has_disp32 = true);
    break;

  case 0x68: // pushq #32
    if (which == end_pc_operand) {
      return ip + 4;
    }
    assert(which == imm_operand && !is_64bit, "pushl has no disp32 or 64bit immediate");
    return ip;                  // not produced by emit_operand

  case 0x66: // movw ... (size prefix)
    again_after_size_prefix2:
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "64bit prefix found"));
      goto again_after_size_prefix2;
    case 0x8B: // movw r, a
    case 0x89: // movw a, r
      debug_only(has_disp32 = true);
      break;
    case 0xC7: // movw a, #16
      debug_only(has_disp32 = true);
      tail_size = 2;  // the imm16
      break;
    case 0x0F: // several SSE/SSE2 variants
      ip--;    // reparse the 0x0F
      goto again_after_prefix;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP8(0xB8): // movl/q r, #32/#64(oop?)
    if (which == end_pc_operand)  return ip + (is_64bit ? 8 : 4);
    // these asserts are somewhat nonsensical
#ifndef _LP64
    assert(which == imm_operand || which == disp32_operand, "");
#else
    assert((which == call32_operand || which == imm_operand) && is_64bit ||
           which == narrow_oop_operand && !is_64bit, "");
#endif // _LP64
    return ip;

  case 0x69: // imul r, a, #32
  case 0xC7: // movl a, #32(oop?)
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x0F: // movx..., etc.
    switch (0xFF & *ip++) {
    case 0x3A: // pcmpestri
      tail_size = 1;
      // fall through: 0x3A shares the extra-opcode-byte layout of 0x38
    case 0x38: // ptest, pmovzxbw
      ip++; // skip opcode
      debug_only(has_disp32 = true); // has both kinds of operands!
      break;

    case 0x70: // pshufd r, r/a, #8
      debug_only(has_disp32 = true); // has both kinds of operands!
      // fall through: both carry a trailing imm8
    case 0x73: // psrldq r, #8
      tail_size = 1;
      break;

    case 0x12: // movlps
    case 0x28: // movaps
    case 0x2E: // ucomiss
    case 0x2F: // comiss
    case 0x54: // andps
    case 0x55: // andnps
    case 0x56: // orps
    case 0x57: // xorps
    case 0x6E: // movd
    case 0x7E: // movd
    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
      debug_only(has_disp32 = true);
      break;

    case 0xAD: // shrd r, a, %cl
    case 0xAF: // imul r, a
    case 0xBE: // movsbl r, a (movsxb)
    case 0xBF: // movswl r, a (movsxw)
    case 0xB6: // movzbl r, a (movzxb)
    case 0xB7: // movzwl r, a (movzxw)
    case REP16(0x40): // cmovl cc, r, a
    case 0xB0: // cmpxchgb
    case 0xB1: // cmpxchg
    case 0xC1: // xaddl
    case 0xC7: // cmpxchg8
    case REP16(0x90): // setcc a
      debug_only(has_disp32 = true);
      // fall out of the switch to decode the address
      break;

    case 0xC4: // pinsrw r, a, #8
      debug_only(has_disp32 = true);
      // fall through: both carry a trailing imm8
    case 0xC5: // pextrw r, r, #8
      tail_size = 1;  // the imm8
      break;

    case 0xAC: // shrd r, a, #8
      debug_only(has_disp32 = true);
      tail_size = 1;  // the imm8
      break;

    case REP16(0x80): // jcc rdisp32
      if (which == end_pc_operand)  return ip + 4;
      assert(which == call32_operand, "jcc has no disp32 or imm");
      return ip;
    default:
      ShouldNotReachHere();
    }
    break;

  case 0x81: // addl a, #32; addl r, #32
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    // on 32bit in the case of cmpl, the imm might be an oop
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x83: // addl a, #8; addl r, #8
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1;
    break;

  case 0x9B:
    switch (0xFF & *ip++) {
    case 0xD9: // fnstcw a
      debug_only(has_disp32 = true);
      break;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP4(0x00): // addb a, r; addl a, r; addb r, a; addl r, a
  case REP4(0x10): // adc...
  case REP4(0x20): // and...
  case REP4(0x30): // xor...
  case REP4(0x08): // or...
  case REP4(0x18): // sbb...
  case REP4(0x28): // sub...
  case 0xF7: // mull a
  case 0x8D: // lea r, a
  case 0x87: // xchg r, a
  case REP4(0x38): // cmp...
  case 0x85: // test r, a
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
  case 0xC6: // movb a, #8
  case 0x80: // cmpb a, #8
  case 0x6B: // imul r, a, #8
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1; // the imm8
    break;

  case 0xC4: // VEX_3bytes
  case 0xC5: // VEX_2bytes
    assert((UseAVX > 0), "shouldn't have VEX prefix");
    assert(ip == inst+1, "no prefixes allowed");
    // C4 and C5 are also used as opcodes for PINSRW and PEXTRW instructions
    // but they have prefix 0x0F and processed when 0x0F processed above.
    //
    // In 32-bit mode the VEX first byte C4 and C5 alias onto LDS and LES
    // instructions (these instructions are not supported in 64-bit mode).
    // To distinguish them bits [7:6] are set in the VEX second byte since
    // ModRM byte can not be of the form 11xxxxxx in 32-bit mode. To set
    // those VEX bits REX and vvvv bits are inverted.
    //
    // Fortunately C2 doesn't generate these instructions so we don't need
    // to check for them in product version.

    // Check second byte
    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));

    // First byte
    if ((0xFF & *inst) == VEX_3bytes) {
      ip++; // third byte
      is_64bit = ((VEX_W & *ip) == VEX_W);
    }
    ip++; // opcode
    // To find the end of instruction (which == end_pc_operand).
    switch (0xFF & *ip) {
    case 0x61: // pcmpestri r, r/a, #8
    case 0x70: // pshufd r, r/a, #8
    case 0x73: // psrldq r, #8
      tail_size = 1;  // the imm8
      break;
    default:
      break;
    }
    ip++; // skip opcode
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
  case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
  case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
  case 0xDD: // fld_d a; fst_d a; fstp_d a
  case 0xDB: // fild_s a; fistp_s a; fld_x a; fstp_x a
  case 0xDF: // fild_d a; fistp_d a
  case 0xD8: // fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a
  case 0xDC: // fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a
  case 0xDE: // faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a
    debug_only(has_disp32 = true);
    break;

  case 0xE8: // call rdisp32
  case 0xE9: // jmp  rdisp32
    if (which == end_pc_operand)  return ip + 4;
    assert(which == call32_operand, "call has no disp32 or imm");
    return ip;

  case 0xF0:                    // Lock
    assert(os::is_MP(), "only on MP");
    goto again_after_prefix;

  case 0xF3:                    // For SSE
  case 0xF2:                    // For SSE2
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "found 64bit prefix"));
      ip++;
      // fall through: skip the opcode byte as well
    default:
      ip++;
    }
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  default:
    ShouldNotReachHere();

  // NOTE(review): REP4 (and end_pc_operand) are not #undef'd here, so they
  // leak past this function — confirm this is intended.
#undef REP8
#undef REP16
  }

  assert(which != call32_operand, "instruction is not a call, jmp, or jcc");
#ifdef _LP64
  assert(which != imm_operand, "instruction is not a movq reg, imm64");
#else
  // assert(which != imm_operand || has_imm32, "instruction has no imm32 field");
  assert(which != imm_operand || has_disp32, "instruction has no imm32 field");
#endif // LP64
  assert(which != disp32_operand || has_disp32, "instruction has no disp32 field");

  // parse the output of emit_operand
  int op2 = 0xFF & *ip++;
  int base = op2 & 0x07;
  int op3 = -1;
  const int b100 = 4;
  const int b101 = 5;
  if (base == b100 && (op2 >> 6) != 3) {
    // r/m == b100 with a memory mod means a SIB byte follows.
    op3 = 0xFF & *ip++;
    base = op3 & 0x07;   // refetch the base
  }
  // now ip points at the disp (if any)

  switch (op2 >> 6) {
  case 0:
    // [00 reg  100][ss index base]
    // [00 reg  100][00   100  esp]
    // [00 reg base]
    // [00 reg  100][ss index  101][disp32]
    // [00 reg  101]               [disp32]

    if (base == b101) {
      if (which == disp32_operand)
        return ip;              // caller wants the disp32
      ip += 4;                  // skip the disp32
    }
    break;

  case 1:
    // [01 reg  100][ss index base][disp8]
    // [01 reg  100][00   100  esp][disp8]
    // [01 reg base]               [disp8]
    ip += 1;                    // skip the disp8
    break;

  case 2:
    // [10 reg  100][ss index base][disp32]
    // [10 reg  100][00   100  esp][disp32]
    // [10 reg base]               [disp32]
    if (which == disp32_operand)
      return ip;                // caller wants the disp32
    ip += 4;                    // skip the disp32
    break;

  case 3:
    // [11 reg base]  (not a memory addressing mode)
    break;
  }

  if (which == end_pc_operand) {
    return ip + tail_size;
  }

#ifdef _LP64
  assert(which == narrow_oop_operand && !is_64bit, "instruction is not a movl adr, imm32");
#else
  assert(which == imm_operand, "instruction has only an imm field");
#endif // LP64
  return ip;
}
 808 
 809 address Assembler::locate_next_instruction(address inst) {
 810   // Secretly share code with locate_operand:
 811   return locate_operand(inst, end_pc_operand);
 812 }
 813 
 814 
 815 #ifdef ASSERT
// Debug-only sanity check: verify that the relocation being recorded at
// inst_mark() refers to operand bytes ending exactly at the current pc,
// i.e. that locate_operand() will later find the operand where the reloc
// points.
void Assembler::check_relocation(RelocationHolder const& rspec, int format) {
  address inst = inst_mark();
  assert(inst != NULL && inst < pc(), "must point to beginning of instruction");
  address opnd;

  Relocation* r = rspec.reloc();
  if (r->type() == relocInfo::none) {
    // Nothing to check for an empty relocation.
    return;
  } else if (r->is_call() || format == call32_operand) {
    // assert(format == imm32_operand, "cannot specify a nonzero format");
    opnd = locate_operand(inst, call32_operand);
  } else if (r->is_data()) {
    assert(format == imm_operand || format == disp32_operand
           LP64_ONLY(|| format == narrow_oop_operand), "format ok");
    opnd = locate_operand(inst, (WhichOperand)format);
  } else {
    // Other relocation kinds carry no embedded operand to cross-check.
    assert(format == imm_operand, "cannot specify a format");
    return;
  }
  assert(opnd == pc(), "must put operand where relocs can find it");
}
 837 #endif // ASSERT
 838 
// Emit a memory operand restricted to the low eight registers (no REX
// extension anywhere in the address).
void Assembler::emit_operand32(Register reg, Address adr) {
  assert(reg->encoding() < 8, "no extended registers");
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 845 
// Unpack an Address into the full-form emit_operand(), forwarding the
// RIP-relative correction for any trailing immediate bytes.
void Assembler::emit_operand(Register reg, Address adr,
                             int rip_relative_correction) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec,
               rip_relative_correction);
}
 852 
// XMM variant: unpack an Address into the full-form emit_operand().
void Assembler::emit_operand(XMMRegister reg, Address adr) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 857 
 858 // MMX operations
// MMX registers share the GP encoding space; MMX addresses never use
// REX-extended base/index registers.
void Assembler::emit_operand(MMXRegister reg, Address adr) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}
 863 
 864 // work around gcc (3.2.1-7a) bug
// Argument-swapped duplicate of the MMX overload (gcc workaround, see the
// note above this function in the file).
void Assembler::emit_operand(Address adr, MMXRegister reg) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}
 869 
 870 
// Emit a two-byte x87 floating-point instruction; 'i' selects the FPU
// stack slot and is folded into the second opcode byte.
void Assembler::emit_farith(int b1, int b2, int i) {
  assert(isByte(b1) && isByte(b2), "wrong opcode");
  assert(0 <= i &&  i < 8, "illegal stack offset");
  emit_byte(b1);
  emit_byte(b2 + i);
}
 877 
 878 
 879 // Now the Assembler instructions (identical for 32/64 bits)
 880 
// adc mem32, imm32 (opcode 0x81 /2; rdx supplies the /2 opcode extension).
void Assembler::adcl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rdx, dst, imm32);
}
 886 
// ADC m32, r32 (0x11).
void Assembler::adcl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x11);
  emit_operand(src, dst);
}
 893 
// ADC r32, imm32 (0x81 /2; 0xD0 is the register-form ModRM base for /2).
void Assembler::adcl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD0, dst, imm32);
}
 898 
// ADC r32, m32 (0x13).
void Assembler::adcl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}
 905 
// ADC r32, r32 (0x13, register-direct form).
void Assembler::adcl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x13, 0xC0, dst, src);
}
 910 
// ADD m32, imm32 (0x81 /0 — /0 from rax's encoding).
void Assembler::addl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rax, dst, imm32);
}
 916 
// ADD m32, r32 (0x01).
void Assembler::addl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}
 923 
// ADD r32, imm32 (0x81 /0, register form).
void Assembler::addl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC0, dst, imm32);
}
 928 
// ADD r32, m32 (0x03).
void Assembler::addl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}
 935 
// ADD r32, r32 (0x03, register-direct form).
void Assembler::addl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
 940 
// Emit a 4-byte multi-byte NOP (0F 1F /0 with 8-bit disp) for code alignment.
void Assembler::addr_nop_4() {
  // 4 bytes: NOP DWORD PTR [EAX+0]
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x40); // emit_rm(cbuf, 0x1, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}
 948 
// Emit a 5-byte multi-byte NOP (0F 1F with SIB and 8-bit disp).
void Assembler::addr_nop_5() {
  // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x44); // emit_rm(cbuf, 0x1, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}
 957 
// Emit a 7-byte multi-byte NOP (0F 1F with 32-bit disp).
void Assembler::addr_nop_7() {
  // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x80); // emit_rm(cbuf, 0x2, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}
 965 
// Emit an 8-byte multi-byte NOP (0F 1F with SIB and 32-bit disp).
void Assembler::addr_nop_8() {
  // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x84); // emit_rm(cbuf, 0x2, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}
 974 
// ADDSD xmm, xmm (F2 0F 58) — scalar double add; requires SSE2 on 32-bit.
void Assembler::addsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}
 981 
// ADDSD xmm, m64 (F2 0F 58).
void Assembler::addsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_operand(dst, src);
}
 989 
// ADDSS xmm, xmm (F3 0F 58) — scalar float add; requires SSE on 32-bit.
void Assembler::addss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}
 996 
// ADDSS xmm, m32 (F3 0F 58).
void Assembler::addss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_operand(dst, src);
}
1004 
// AND m32, imm32 (0x81 /4 — /4 from rsp's encoding; the trailing 4 tells
// emit_operand the immediate occupies 4 bytes after the operand).
void Assembler::andl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);
  emit_long(imm32);
}
1012 
// AND r32, imm32 (0x81 /4, register form).
void Assembler::andl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE0, dst, imm32);
}
1017 
// AND r32, m32 (0x23).
void Assembler::andl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}
1024 
// AND r32, r32 (0x23, register-direct form).
void Assembler::andl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x23, 0xC0, dst, src);
}
1029 
// ANDPD xmm, m128 (66 0F 54) — packed-double bitwise AND.
void Assembler::andpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x54);
  emit_operand(dst, src);
}
1037 
// ANDPD xmm, xmm (66 0F 54).
void Assembler::andpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x54);
  emit_byte(0xC0 | encode);
}
1044 
// ANDPS xmm, m128 (0F 54) — packed-single bitwise AND.
void Assembler::andps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x54);
  emit_operand(dst, src);
}
1052 
// ANDPS xmm, xmm (0F 54).
void Assembler::andps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x54);
  emit_byte(0xC0 | encode);
}
1059 
// BSF r32, r32 (0F BC) — bit scan forward.
void Assembler::bsfl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}
1066 
// BSR r32, r32 (0F BD) — bit scan reverse. Guarded because with an F3
// prefix this same opcode decodes as LZCNT on supporting CPUs.
void Assembler::bsrl(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
1074 
// BSWAP r32 (0F C8+r) — byte-swap the register in place.
void Assembler::bswapl(Register reg) { // bswap
  int encode = prefix_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}
1080 
// CALL rel32 (0xE8) to a label. A bound label must be behind the current
// pc (backward call); an unbound label records a patch site and emits a
// zero displacement to be fixed up when the label binds.
void Assembler::call(Label& L, relocInfo::relocType rtype) {
  // suspect disp32 is always good
  int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);

  if (L.is_bound()) {
    const int long_size = 5;
    int offs = (int)( target(L) - pc() );
    assert(offs <= 0, "assembler error");
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    emit_byte(0xE8);
    // displacement is relative to the end of the 5-byte instruction
    emit_data(offs - long_size, rtype, operand);
  } else {
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    L.add_patch_at(code(), locator());

    emit_byte(0xE8);
    emit_data(int(0), rtype, operand);
  }
}
1102 
// CALL r32/r64 (FF /2, register-direct) — indirect call through a register.
void Assembler::call(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xD0 | encode);
}
1108 
1109 
// CALL m (FF /2 — /2 from rdx's encoding) — indirect call through memory.
void Assembler::call(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rdx, adr);
}
1116 
// CALL rel32 to an absolute code address with relocation info; the
// target must be within +/-2GB of the next instruction.
void Assembler::call_literal(address entry, RelocationHolder const& rspec) {
  assert(entry != NULL, "call most probably wrong");
  InstructionMark im(this);
  emit_byte(0xE8);
  // displacement is relative to the end of the instruction (pc + 4)
  intptr_t disp = entry - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (call2)");
  // Technically, should use call32_operand, but this format is
  // implied by the fact that we're emitting a call instruction.

  int operand = LP64_ONLY(disp32_operand) NOT_LP64(call32_operand);
  emit_data((int) disp, rspec, operand);
}
1129 
// CDQ (0x99) — sign-extend eax into edx:eax.
void Assembler::cdql() {
  emit_byte(0x99);
}
1133 
// CMOVcc r32, r32 (0F 40+cc) — conditional move; CMOV support is only
// guaranteed on 64-bit, hence the 32-bit guard.
void Assembler::cmovl(Condition cc, Register dst, Register src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}
1141 
1142 
// CMOVcc r32, m32 (0F 40+cc).
void Assembler::cmovl(Condition cc, Register dst, Address src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
1150 
// CMP m8, imm8 (0x80 /7 — /7 from rdi's encoding).
void Assembler::cmpb(Address dst, int imm8) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x80);
  emit_operand(rdi, dst, 1);
  emit_byte(imm8);
}
1158 
// CMP m32, imm32 (0x81 /7).
void Assembler::cmpl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}
1166 
// CMP r32, imm32 (0x81 /7, register form).
void Assembler::cmpl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF8, dst, imm32);
}
1171 
// CMP r32, r32 (0x3B, register-direct form).
void Assembler::cmpl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}
1176 
1177 
// CMP r32, m32 (0x3B).
void Assembler::cmpl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}
1184 
// CMP m16, imm16 (66 81 /7). 32-bit-only encoding: no REX prefix is
// emitted, so the address must not need extended registers.
void Assembler::cmpw(Address dst, int imm16) {
  InstructionMark im(this);
  assert(!dst.base_needs_rex() && !dst.index_needs_rex(), "no extended registers");
  emit_byte(0x66);  // operand-size override: 16-bit
  emit_byte(0x81);
  emit_operand(rdi, dst, 2);
  emit_word(imm16);
}
1193 
// The 32-bit cmpxchg compares the value at adr with the contents of rax,
// and stores reg into adr if so; otherwise, the value at adr is loaded into rax.
// The ZF is set if the compared values were equal, and cleared otherwise.
// CMPXCHG m32, r32 (0F B1). When bit 1 of the Atomics flag is set, a
// non-atomic cmp/mov/jcc sequence is substituted for diagnostics.
void Assembler::cmpxchgl(Register reg, Address adr) { // cmpxchg
  if (Atomics & 2) {
     // caveat: no instructionmark, so this isn't relocatable.
     // Emit a synthetic, non-atomic, CAS equivalent.
     // Beware.  The synthetic form sets all ICCs, not just ZF.
     // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r)
     cmpl(rax, adr);
     movl(rax, adr);
     if (reg != rax) {
        Label L ;
        jcc(Assembler::notEqual, L);
        movl(adr, reg);
        bind(L);
     }
  } else {
     InstructionMark im(this);
     prefix(adr, reg);
     emit_byte(0x0F);
     emit_byte(0xB1);
     emit_operand(reg, adr);
  }
}
1219 
// COMISD xmm, m64 (66 0F 2F) — ordered scalar-double compare, sets EFLAGS.
void Assembler::comisd(XMMRegister dst, Address src) {
  // NOTE: dbx seems to decode this as comiss even though the
  // 0x66 is there. Strangely ucomisd comes out correct
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x2F);
  emit_operand(dst, src);
}
1229 
// COMISD xmm, xmm (66 0F 2F).
void Assembler::comisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x2F);
  emit_byte(0xC0 | encode);
}
1236 
// COMISS xmm, m32 (0F 2F) — ordered scalar-single compare, sets EFLAGS.
void Assembler::comiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2F);
  emit_operand(dst, src);
}
1244 
// COMISS xmm, xmm (0F 2F).
void Assembler::comiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2F);
  emit_byte(0xC0 | encode);
}
1251 
// CVTDQ2PD xmm, xmm (F3 0F E6) — packed int32 to packed double.
void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0xE6);
  emit_byte(0xC0 | encode);
}
1258 
// CVTDQ2PS xmm, xmm (0F 5B) — packed int32 to packed single.
void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x5B);
  emit_byte(0xC0 | encode);
}
1265 
// CVTSD2SS xmm, xmm (F2 0F 5A) — scalar double to scalar single.
void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5A);
  emit_byte(0xC0 | encode);
}
1272 
// CVTSD2SS xmm, m64 (F2 0F 5A).
void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5A);
  emit_operand(dst, src);
}
1280 
// CVTSI2SD xmm, r32 (F2 0F 2A) — signed int32 to scalar double.
void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}
1287 
// CVTSI2SD xmm, m32 (F2 0F 2A).
void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_operand(dst, src);
}
1295 
// CVTSI2SS xmm, r32 (F3 0F 2A) — signed int32 to scalar single.
void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}
1302 
// CVTSI2SS xmm, m32 (F3 0F 2A).
void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_operand(dst, src);
}
1310 
// CVTSS2SD xmm, xmm (F3 0F 5A) — scalar single to scalar double.
void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5A);
  emit_byte(0xC0 | encode);
}
1317 
// CVTSS2SD xmm, m32 (F3 0F 5A).
void Assembler::cvtss2sd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5A);
  emit_operand(dst, src);
}
1325 
1326 
// CVTTSD2SI r32, xmm (F2 0F 2C) — truncating scalar double to int32.
void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
1333 
// CVTTSS2SI r32, xmm (F3 0F 2C) — truncating scalar single to int32.
void Assembler::cvttss2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
1340 
// DEC m32 (FF /1 — /1 from rcx's encoding).
void Assembler::decl(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrement() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}
1348 
// DIVSD xmm, m64 (F2 0F 5E) — scalar double divide.
void Assembler::divsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_operand(dst, src);
}
1356 
// DIVSD xmm, xmm (F2 0F 5E).
void Assembler::divsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}
1363 
// DIVSS xmm, m32 (F3 0F 5E) — scalar single divide.
void Assembler::divss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_operand(dst, src);
}
1371 
// DIVSS xmm, xmm (F3 0F 5E).
void Assembler::divss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}
1378 
// EMMS (0F 77) — empty MMX state, re-enabling x87 use of the FP registers.
void Assembler::emms() {
  NOT_LP64(assert(VM_Version::supports_mmx(), ""));
  emit_byte(0x0F);
  emit_byte(0x77);
}
1384 
// HLT (0xF4) — halt; faults in user mode, used to pad/poison code.
void Assembler::hlt() {
  emit_byte(0xF4);
}
1388 
// IDIV r32 (F7 /7) — signed divide of edx:eax by src.
void Assembler::idivl(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}
1394 
// DIV r32 (F7 /6) — unsigned divide of edx:eax by src.
void Assembler::divl(Register src) { // Unsigned
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF0 | encode);
}
1400 
// IMUL r32, r32 (0F AF) — two-operand signed multiply.
void Assembler::imull(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}
1407 
1408 
// IMUL r32, r32, imm — uses the short imm8 form (0x6B) when the constant
// fits in a signed byte, else the imm32 form (0x69).
void Assembler::imull(Register dst, Register src, int value) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}
1421 
// INC m32 (FF /0 — /0 from rax's encoding).
void Assembler::incl(Address dst) {
  // Don't use it directly. Use MacroAssembler::increment() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}
1429 
// Jcc to a label. A bound label gets the 2-byte short form (70+cc) when
// the displacement fits in 8 bits and maybe_short allows it, else the
// 6-byte long form (0F 80+cc rel32). An unbound label records a patch
// site and always uses the long form.
void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
  InstructionMark im(this);
  assert((0 <= cc) && (cc < 16), "illegal cc");
  if (L.is_bound()) {
    address dst = target(L);
    assert(dst != NULL, "jcc most probably wrong");

    const int short_size = 2;
    const int long_size = 6;
    // displacement is encoded relative to the end of the instruction
    intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      assert(is_simm32(offs - long_size),
             "must be 32bit offset (call4)");
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
    // Note: could eliminate cond. jumps to this jump if condition
    //       is the same however, seems to be rather unlikely case.
    // Note: use jccb() if label to be bound is very close to get
    //       an 8-bit displacement
    L.add_patch_at(code(), locator());
    emit_byte(0x0F);
    emit_byte(0x80 | cc);
    emit_long(0);
  }
}
1463 
1464 void Assembler::jccb(Condition cc, Label& L) {
1465   if (L.is_bound()) {
1466     const int short_size = 2;
1467     address entry = target(L);
1468     assert(is8bit((intptr_t)entry - ((intptr_t)_code_pos + short_size)),
1469            "Dispacement too large for a short jmp");
1470     intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
1471     // 0111 tttn #8-bit disp
1472     emit_byte(0x70 | cc);
1473     emit_byte((offs - short_size) & 0xFF);
1474   } else {
1475     InstructionMark im(this);
1476     L.add_patch_at(code(), locator());
1477     emit_byte(0x70 | cc);
1478     emit_byte(0);
1479   }
1480 }
1481 
// JMP m (FF /4 — /4 from rsp's encoding) — indirect jump through memory.
void Assembler::jmp(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rsp, adr);
}
1488 
// JMP to a label. A bound label gets the 2-byte short form (0xEB rel8)
// when it fits and maybe_short allows, else the 5-byte form (0xE9 rel32).
// An unbound label records a patch site and uses the 32-bit form.
void Assembler::jmp(Label& L, bool maybe_short) {
  if (L.is_bound()) {
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
    InstructionMark im(this);
    const int short_size = 2;
    const int long_size = 5;
    // displacement is encoded relative to the end of the instruction
    intptr_t offs = entry - _code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      emit_byte(0xEB);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      emit_byte(0xE9);
      emit_long(offs - long_size);
    }
  } else {
    // By default, forward jumps are always 32-bit displacements, since
    // we can't yet know where the label will be bound.  If you're sure that
    // the forward jump will not run beyond 256 bytes, use jmpb to
    // force an 8-bit displacement.
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xE9);
    emit_long(0);
  }
}
1515 
// JMP r (FF /4, register-direct) — indirect jump through a register.
void Assembler::jmp(Register entry) {
  int encode = prefix_and_encode(entry->encoding());
  emit_byte(0xFF);
  emit_byte(0xE0 | encode);
}
1521 
// JMP rel32 (0xE9) to an absolute code address with relocation info;
// the target must be within +/-2GB of the next instruction.
void Assembler::jmp_literal(address dest, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xE9);
  assert(dest != NULL, "must have a target");
  // displacement is relative to the end of the instruction (pc + 4)
  intptr_t disp = dest - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (jmp)");
  emit_data(disp, rspec.reloc(), call32_operand);
}
1530 
1531 void Assembler::jmpb(Label& L) {
1532   if (L.is_bound()) {
1533     const int short_size = 2;
1534     address entry = target(L);
1535     assert(is8bit((entry - _code_pos) + short_size),
1536            "Dispacement too large for a short jmp");
1537     assert(entry != NULL, "jmp most probably wrong");
1538     intptr_t offs = entry - _code_pos;
1539     emit_byte(0xEB);
1540     emit_byte((offs - short_size) & 0xFF);
1541   } else {
1542     InstructionMark im(this);
1543     L.add_patch_at(code(), locator());
1544     emit_byte(0xEB);
1545     emit_byte(0);
1546   }
1547 }
1548 
// LDMXCSR m32 (0F AE /2) — load the SSE control/status register.
void Assembler::ldmxcsr( Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(2), src);  // /2 opcode extension
}
1557 
// LEA r32, m (0x8D). On 64-bit an addr32 (0x67) prefix forces 32-bit
// address arithmetic so the result matches 32-bit semantics.
void Assembler::leal(Register dst, Address src) {
  InstructionMark im(this);
#ifdef _LP64
  emit_byte(0x67); // addr32
  prefix(src, dst);
#endif // LP64
  emit_byte(0x8D);
  emit_operand(dst, src);
}
1567 
// Emit the LOCK prefix (0xF0); with bit 0 of the Atomics flag set, a
// plain NOP (0x90) is emitted instead for diagnostic runs.
void Assembler::lock() {
  if (Atomics & 1) {
     // Emit either nothing, a NOP, or a NOP: prefix
     emit_byte(0x90) ;
  } else {
     emit_byte(0xF0);
  }
}
1576 
// LZCNT r32, r32 (F3 0F BD) — count leading zeros. Without LZCNT
// support this byte pattern decodes as BSR, hence the guard. The F3
// prefix must precede any REX prefix, so it is emitted first.
void Assembler::lzcntl(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
1585 
1586 // Emit mfence instruction
// MFENCE (0F AE F0) — full memory fence; requires SSE2 on 32-bit.
void Assembler::mfence() {
  NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
  emit_byte( 0x0F );
  emit_byte( 0xAE );
  emit_byte( 0xF0 );
}
1593 
// Pointer-width register move: movq on 64-bit, movl on 32-bit.
void Assembler::mov(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
1597 
// MOVAPD xmm, xmm (66 0F 28) — aligned packed-double move.
void Assembler::movapd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x28);
  emit_byte(0xC0 | encode);
}
1604 
// MOVAPS xmm, xmm (0F 28) — aligned packed-single move.
void Assembler::movaps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x28);
  emit_byte(0xC0 | encode);
}
1611 
// MOV r8, m8 (0x8A); on 32-bit only AL/CL/DL/BL are byte-addressable.
void Assembler::movb(Register dst, Address src) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  InstructionMark im(this);
  prefix(src, dst, true);  // byte-register prefix handling
  emit_byte(0x8A);
  emit_operand(dst, src);
}
1619 
1620 
// MOV m8, imm8 (C6 /0).
void Assembler::movb(Address dst, int imm8) {
  InstructionMark im(this);
   prefix(dst);
  emit_byte(0xC6);
  emit_operand(rax, dst, 1);
  emit_byte(imm8);
}
1628 
1629 
// MOV m8, r8 (0x88); src must be byte-addressable.
void Assembler::movb(Address dst, Register src) {
  assert(src->has_byte_register(), "must have byte register");
  InstructionMark im(this);
  prefix(dst, src, true);  // byte-register prefix handling
  emit_byte(0x88);
  emit_operand(src, dst);
}
1637 
// MOVD xmm, r32 (66 0F 6E).
void Assembler::movdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}
1644 
// MOVD r32, xmm (66 0F 7E); the store form encodes the xmm register in
// the reg field, so the prefix helper sees the operands swapped.
void Assembler::movdl(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}
1652 
// MOVD xmm, m32 (66 0F 6E).
void Assembler::movdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_operand(dst, src);
}
1660 
// MOVDQA xmm, xmm (66 0F 6F) — aligned double-quadword move.
void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}
1667 
// MOVDQU xmm, m128 (F3 0F 6F) — unaligned double-quadword load.
void Assembler::movdqu(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x6F);
  emit_operand(dst, src);
}
1675 
// MOVDQU xmm, xmm (F3 0F 6F).
void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}
1682 
// MOVDQU m128, xmm (F3 0F 7F) — unaligned double-quadword store.
void Assembler::movdqu(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7F);
  emit_operand(src, dst);
}
1690 
1691 // Uses zero extension on 64bit
1692 
// MOV r32, imm32 (B8+r) — zero-extends into the full register on 64-bit.
void Assembler::movl(Register dst, int32_t imm32) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_long(imm32);
}
1698 
// MOV r32, r32 (0x8B, register-direct form).
void Assembler::movl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}
1704 
// MOV r32, m32 (0x8B).
void Assembler::movl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}
1711 
// MOV m32, imm32 (C7 /0).
void Assembler::movl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_long(imm32);
}
1719 
// MOV m32, r32 (0x89).
void Assembler::movl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
1726 
1727 // New cpus require to use movsd and movss to avoid partial register stall
1728 // when loading from memory. But for old Opteron use movlpd instead of movsd.
1729 // The selection is done in MacroAssembler::movdbl() and movflt().
// MOVLPD xmm, m64 (66 0F 12) — load into the low quadword, upper half
// of dst preserved (no partial-register stall on old Opteron).
void Assembler::movlpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x12);
  emit_operand(dst, src);
}
1737 
// MOVQ mm, m64 (0F 6F) — MMX 64-bit load.
void Assembler::movq( MMXRegister dst, Address src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x6F);
  emit_operand(dst, src);
}
1744 
// MOVQ m64, mm (0F 7F) — MMX 64-bit store.
void Assembler::movq( Address dst, MMXRegister src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x7F);
  // workaround gcc (3.2.1-7a) bug
  // In that version of gcc with only an emit_operand(MMX, Address)
  // gcc will tail jump and try and reverse the parameters completely
  // obliterating dst in the process. By having a version available
  // that doesn't need to swap the args at the tail jump the bug is
  // avoided.
  emit_operand(dst, src);
}
1757 
// MOVQ xmm, m64 (F3 0F 7E) — 64-bit load, upper half of xmm zeroed.
void Assembler::movq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7E);
  emit_operand(dst, src);
}
1765 
// MOVQ m64, xmm (66 0F D6) — 64-bit store of the low quadword.
void Assembler::movq(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0xD6);
  emit_operand(src, dst);
}
1773 
// MOVSX r32, m8 (0F BE) — sign-extending byte load.
void Assembler::movsbl(Register dst, Address src) { // movsxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}
1781 
// MOVSX r32, r8 (0F BE); on 32-bit src must be byte-addressable.
void Assembler::movsbl(Register dst, Register src) { // movsxb
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);
}
1789 
// MOVSD xmm, xmm (F2 0F 10) — scalar-double register move.
void Assembler::movsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x10);
  emit_byte(0xC0 | encode);
}
1796 
// MOVSD xmm, m64 (F2 0F 10).
void Assembler::movsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x10);
  emit_operand(dst, src);
}
1804 
// MOVSD m64, xmm (F2 0F 11).
void Assembler::movsd(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x11);
  emit_operand(src, dst);
}
1812 
// MOVSS xmm, xmm (F3 0F 10) — scalar-single register move.
void Assembler::movss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x10);
  emit_byte(0xC0 | encode);
}
1819 
// MOVSS xmm, m32 (F3 0F 10).
void Assembler::movss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x10);
  emit_operand(dst, src);
}
1827 
// MOVSS m32, xmm (F3 0F 11).
void Assembler::movss(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x11);
  emit_operand(src, dst);
}
1835 
// MOVSX r32, m16 (0F BF) — sign-extending word load.
void Assembler::movswl(Register dst, Address src) { // movsxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}
1843 
// MOVSX r32, r16 (0F BF).
void Assembler::movswl(Register dst, Register src) { // movsxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}
1850 
// MOV m16, imm16 (66 C7 /0).
void Assembler::movw(Address dst, int imm16) {
  InstructionMark im(this);

  emit_byte(0x66); // switch to 16-bit mode
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 2);
  emit_word(imm16);
}
1860 
1861 void Assembler::movw(Register dst, Address src) {
1862   InstructionMark im(this);
1863   emit_byte(0x66);
1864   prefix(src, dst);
1865   emit_byte(0x8B);
1866   emit_operand(dst, src);
1867 }
1868 
1869 void Assembler::movw(Address dst, Register src) {
1870   InstructionMark im(this);
1871   emit_byte(0x66);
1872   prefix(dst, src);
1873   emit_byte(0x89);
1874   emit_operand(src, dst);
1875 }
1876 
// MOVZX r32, m8 — zero-extending byte load (0F B6 /r).
void Assembler::movzbl(Register dst, Address src) { // movzxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}

// MOVZX r32, r8 — zero-extend a byte register into a 32-bit register.
// On 32-bit only ah/bh/ch/dh-capable registers have a byte form, hence the assert;
// the 'true' argument requests byte-register REX handling on 64-bit.
void Assembler::movzbl(Register dst, Register src) { // movzxb
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}

// MOVZX r32, m16 — zero-extending 16-bit load (0F B7 /r).
void Assembler::movzwl(Register dst, Address src) { // movzxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}

// MOVZX r32, r16 — zero-extend a 16-bit register into a 32-bit register.
void Assembler::movzwl(Register dst, Register src) { // movzxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}
1907 
// MUL m32 — unsigned multiply EAX by a memory operand (F7 /4).
void Assembler::mull(Address src) {
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xF7);
  emit_operand(rsp, src); // /4 opcode extension encoded via rsp (encoding 4)
}

// MUL r32 — unsigned multiply EAX by a register (F7 /4, register form 0xE0 | reg).
void Assembler::mull(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xE0 | encode);
}

// MULSD xmm, m64 — scalar double multiply (F2-prefixed 0F 59 /r).
void Assembler::mulsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_operand(dst, src);
}

// MULSD xmm, xmm — scalar double multiply, register form.
void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}

// MULSS xmm, m32 — scalar float multiply (F3-prefixed 0F 59 /r).
void Assembler::mulss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_operand(dst, src);
}

// MULSS xmm, xmm — scalar float multiply, register form.
void Assembler::mulss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}

// NEG r32 — two's-complement negate (F7 /3, register form 0xD8 | reg).
void Assembler::negl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD8 | encode);
}
1956 
// Emit exactly i bytes of padding using NOP instructions.
// Three strategies: multi-byte address NOPs tuned for Intel, multi-byte
// address NOPs tuned for AMD, or plain 0x66-prefixed single NOPs as the
// generic fallback. Debug builds always use single-byte 0x90 NOPs so
// disassembly stays trivial. NOTE: the switch statements below rely on
// intentional case fallthrough to build up prefix bytes.
void Assembler::nop(int i) {
#ifdef ASSERT
  assert(i > 0, " ");
  // The fancy nops aren't currently recognized by debuggers making it a
  // pain to disassemble code while debugging. If asserts are on clearly
  // speed is not an issue so simply use the single byte traditional nop
  // to do alignment.

  for (; i > 0 ; i--) emit_byte(0x90);
  return;

#endif // ASSERT

  if (UseAddressNop && VM_Version::is_intel()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for Intel
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is Intel specific - don't use consecutive address nops

    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90

    while(i >= 15) {
      // For Intel don't generate consecutive address nops (mix with regular nops)
      i -= 15;
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      addr_nop_8();
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x90);   // nop
    }
    // Remainder 0..14: cases fall through to accumulate 0x66 prefixes.
    switch (i) {
      case 14:
        emit_byte(0x66); // size prefix
      case 13:
        emit_byte(0x66); // size prefix
      case 12:
        addr_nop_8();
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x90); // nop
        break;
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }
  if (UseAddressNop && VM_Version::is_amd()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for AMD.
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is AMD specific - use consecutive address nops

    // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //     Size prefixes (0x66) are added for larger sizes

    while(i >= 22) {
      i -= 11;
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      addr_nop_8();
    }
    // Generate first nop for size between 21-12
    switch (i) {
      case 21:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 20:
      case 19:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 18:
      case 17:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 16:
      case 15:
        i -= 8;
        addr_nop_8();
        break;
      case 14:
      case 13:
        i -= 7;
        addr_nop_7();
        break;
      case 12:
        i -= 6;
        emit_byte(0x66); // size prefix
        addr_nop_5();
        break;
      default:
        assert(i < 12, " ");
    }

    // Generate second nop for size between 11-1
    switch (i) {
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }

  // Fallback for other/unknown CPUs.
  // Using nops with size prefixes "0x66 0x90".
  // From AMD Optimization Guide:
  //  1: 0x90
  //  2: 0x66 0x90
  //  3: 0x66 0x66 0x90
  //  4: 0x66 0x66 0x66 0x90
  //  5: 0x66 0x66 0x90 0x66 0x90
  //  6: 0x66 0x66 0x90 0x66 0x66 0x90
  //  7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
  //  8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
  //  9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  //
  while(i > 12) {
    i -= 4;
    emit_byte(0x66); // size prefix
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90); // nop
  }
  // 1 - 12 nops
  if(i > 8) {
    if(i > 9) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  // 1 - 8 nops
  if(i > 4) {
    if(i > 6) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  // Remaining 0..4 bytes: fallthrough accumulates 0x66 prefixes.
  switch (i) {
    case 4:
      emit_byte(0x66);
    case 3:
      emit_byte(0x66);
    case 2:
      emit_byte(0x66);
    case 1:
      emit_byte(0x90);
      break;
    default:
      assert(i == 0, " ");
  }
}
2204 
// NOT r32 — one's-complement negate (F7 /2, register form 0xD0 | reg).
void Assembler::notl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD0 | encode );
}

// OR m32, imm32 — (81 /1 id; emit_arith_operand may pick the sign-extended imm8 form).
void Assembler::orl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rcx, dst, imm32); // /1 opcode extension encoded via rcx
}

// OR r32, imm32.
void Assembler::orl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC8, dst, imm32);
}

// OR r32, m32 (0B /r).
void Assembler::orl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}

// OR r32, r32.
void Assembler::orl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}
2233 
// PACKUSWB xmm, m128 — pack words to unsigned bytes with saturation (66 0F 67 /r).
// The memory form is only emitted under AVX, where alignment is not required;
// in SSE mode a 16-byte-aligned operand would be needed, hence the assert.
void Assembler::packuswb(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x67);
  emit_operand(dst, src);
}

// PACKUSWB xmm, xmm — register form.
void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x67);
  emit_byte(0xC0 | encode);
}

// PCMPESTRI xmm, m128, imm8 — explicit-length string compare (66 0F 3A 61 /r ib).
void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_operand(dst, src);
  emit_byte(imm8); // comparison-control immediate
}

// PCMPESTRI xmm, xmm, imm8 — register form.
void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_byte(0xC0 | encode);
  emit_byte(imm8);
}

// PMOVZXBW xmm, m64 — zero-extend packed bytes to words (66 0F 38 30 /r).
void Assembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_operand(dst, src);
}

// PMOVZXBW xmm, xmm — register form.
void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_byte(0xC0 | encode);
}
2281 
// generic
// POP r64/r32 — single-byte opcode 0x58 + register encoding.
void Assembler::pop(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0x58 | encode);
}

// POPCNT r32, m32 — population count (F3 0F B8 /r).
// Note the mandatory F3 prefix must precede any REX prefix.
void Assembler::popcntl(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}

// POPCNT r32, r32 — register form.
void Assembler::popcntl(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3); // mandatory prefix, emitted before REX
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}

// POPF — pop flags register from the stack (0x9D).
void Assembler::popf() {
  emit_byte(0x9D);
}
2310 
2311 #ifndef _LP64 // no 32bit push/pop on amd64
// POP m32 — pop top of stack into memory (8F /0); 32-bit only (see guard above).
void Assembler::popl(Address dst) {
  // NOTE: this will adjust stack by 8byte on 64bits
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst); // /0 opcode extension encoded via rax
}
2319 #endif
2320 
// Shared helper: address-size/REX prefix plus the 0F escape byte for
// the prefetch opcodes below.
void Assembler::prefetch_prefix(Address src) {
  prefix(src);
  emit_byte(0x0F);
}

// PREFETCHNTA m8 — non-temporal prefetch hint (0F 18 /0).
void Assembler::prefetchnta(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rax, src); // 0, src
}

// PREFETCH m8 — 3DNow! read prefetch (0F 0D /0).
void Assembler::prefetchr(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rax, src); // 0, src
}

// PREFETCHT0 m8 — prefetch into all cache levels (0F 18 /1).
void Assembler::prefetcht0(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rcx, src); // 1, src
}

// PREFETCHT1 m8 — prefetch into L2 and higher (0F 18 /2).
void Assembler::prefetcht1(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rdx, src); // 2, src
}

// PREFETCHT2 m8 — prefetch into L3 and higher (0F 18 /3).
void Assembler::prefetcht2(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rbx, src); // 3, src
}

// PREFETCHW m8 — 3DNow! prefetch with intent to write (0F 0D /1).
void Assembler::prefetchw(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rcx, src); // 1, src
}
2373 
// Emit a raw prefix byte (REX, segment override, etc.) into the code stream.
void Assembler::prefix(Prefix p) {
  a_byte(p);
}

// POR xmm, xmm — bitwise OR of packed data (66 0F EB /r).
void Assembler::por(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEB);
  emit_byte(0xC0 | encode);
}

// POR xmm, m128 — memory form; only emitted under AVX since the SSE form
// would require a 16-byte-aligned operand.
void Assembler::por(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEB);
  emit_operand(dst, src);
}
2393 
// PSHUFD xmm, xmm, imm8 — shuffle packed doublewords (66 0F 70 /r ib).
void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x70);
  emit_byte(0xC0 | encode);
  emit_byte(mode & 0xFF); // shuffle-control immediate

}

// PSHUFD xmm, m128, imm8 — memory form (AVX only; SSE would need 16-byte alignment).
void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);
}

// PSHUFLW xmm, xmm, imm8 — shuffle packed low words (F2 0F 70 /r ib).
void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);
  emit_byte(0xC0 | encode);
  emit_byte(mode & 0xFF);
}

// PSHUFLW xmm, m128, imm8 — memory form (AVX only; SSE would need 16-byte alignment).
void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);
}
2434 
// PSRLQ xmm, imm8 — shift each 64-bit lane right (66 0F 73 /2 ib).
void Assembler::psrlq(XMMRegister dst, int shift) {
  // Shift 64 bit value logically right by specified number of bits.
  // HMM Table D-1 says sse2 or mmx.
  // Do not confuse it with psrldq SSE2 instruction which
  // shifts 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); // xmm2 carries the /2 opcode extension
  emit_byte(0x73);
  emit_byte(0xC0 | encode);
  emit_byte(shift);
}

// PSRLDQ xmm, imm8 — shift the whole 128-bit register right by bytes (66 0F 73 /3 ib).
void Assembler::psrldq(XMMRegister dst, int shift) {
  // Shift 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66); // xmm3 carries the /3 opcode extension
  emit_byte(0x73);
  emit_byte(0xC0 | encode);
  emit_byte(shift);
}
2455 
// PTEST xmm, m128 — logical compare setting ZF/CF (66 0F 38 17 /r);
// memory form restricted to AVX where alignment is not required.
void Assembler::ptest(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_operand(dst, src);
}

// PTEST xmm, xmm — register form.
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_byte(0xC0 | encode);
}

// PUNPCKLBW xmm, m128 — interleave low bytes (66 0F 60 /r); AVX-only memory form.
void Assembler::punpcklbw(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x60);
  emit_operand(dst, src);
}

// PUNPCKLBW xmm, xmm — register form.
void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x60);
  emit_byte(0xC0 | encode);
}

// PUNPCKLDQ xmm, m128 — interleave low doublewords (66 0F 62 /r); AVX-only memory form.
void Assembler::punpckldq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x62);
  emit_operand(dst, src);
}

// PUNPCKLDQ xmm, xmm — register form.
void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x62);
  emit_byte(0xC0 | encode);
}
2503 
// PUSH imm32 — push a sign-extended 32-bit immediate (0x68 id).
void Assembler::push(int32_t imm32) {
  // in 64bits we push 64bits onto the stack but only
  // take a 32bit immediate
  emit_byte(0x68);
  emit_long(imm32);
}

// PUSH r64/r32 — single-byte opcode 0x50 + register encoding.
void Assembler::push(Register src) {
  int encode = prefix_and_encode(src->encoding());

  emit_byte(0x50 | encode);
}

// PUSHF — push flags register onto the stack (0x9C).
void Assembler::pushf() {
  emit_byte(0x9C);
}
2520 
2521 #ifndef _LP64 // no 32bit push/pop on amd64
// PUSH m32 — push a memory operand (FF /6); 32-bit only (see guard above).
void Assembler::pushl(Address src) {
  // Note this will push 64bit on 64bit
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xFF);
  emit_operand(rsi, src); // /6 opcode extension encoded via rsi
}
2529 #endif
2530 
// PXOR xmm, m128 — bitwise XOR of packed data (66 0F EF /r);
// memory form restricted to AVX where alignment is not required.
void Assembler::pxor(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEF);
  emit_operand(dst, src);
}

// PXOR xmm, xmm — register form (commonly used to zero a register).
void Assembler::pxor(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEF);
  emit_byte(0xC0 | encode);
}
2546 
// RCL r32, imm8 — rotate left through carry. Uses the one-byte-shorter
// 0xD1 /2 form when the count is 1, otherwise 0xC1 /2 ib.
void Assembler::rcll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}
2559 
// copies data from [esi] to [edi] using rcx pointer sized words
// generic
// REP MOVS — F3 A5; on 64-bit a REX.W prefix makes it MOVSQ.
void Assembler::rep_mov() {
  emit_byte(0xF3);
  // MOVSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xA5);
}

// sets rcx pointer sized words with rax, value at [edi]
// generic
// REP STOS — F3 AB; on 64-bit a REX.W prefix makes it STOSQ.
void Assembler::rep_set() { // rep_set
  emit_byte(0xF3);
  // STOSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAB);
}

// scans rcx pointer sized words at [edi] for occurrence of rax,
// generic
// REPNE SCAS — F2 AF; on 64-bit a REX.W prefix makes it SCASQ.
void Assembler::repne_scan() { // repne_scan
  emit_byte(0xF2);
  // SCASQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAF);
}

#ifdef _LP64
// scans rcx 4 byte words at [edi] for occurrence of rax,
// generic
// 32-bit-element variant: REPNE SCASD (F2 AF, no REX.W).
void Assembler::repne_scanl() { // repne_scan
  emit_byte(0xF2);
  // SCASL
  emit_byte(0xAF);
}
#endif
2596 
// RET / RET imm16 — near return; the imm16 form (C2 iw) additionally
// pops imm16 bytes of arguments.
void Assembler::ret(int imm16) {
  if (imm16 == 0) {
    emit_byte(0xC3);
  } else {
    emit_byte(0xC2);
    emit_word(imm16);
  }
}

// SAHF — store AH into flags (0x9E). Guarded out on 64-bit: debug builds
// abort via ShouldNotReachHere(); product builds would fall through and emit.
void Assembler::sahf() {
#ifdef _LP64
  // Not supported in 64bit mode
  ShouldNotReachHere();
#endif
  emit_byte(0x9E);
}
2613 
2614 void Assembler::sarl(Register dst, int imm8) {
2615   int encode = prefix_and_encode(dst->encoding());
2616   assert(isShiftCount(imm8), "illegal shift count");
2617   if (imm8 == 1) {
2618     emit_byte(0xD1);
2619     emit_byte(0xF8 | encode);
2620   } else {
2621     emit_byte(0xC1);
2622     emit_byte(0xF8 | encode);
2623     emit_byte(imm8);
2624   }
2625 }
2626 
// SAR r32, CL — arithmetic right shift by the count in CL (D3 /7).
void Assembler::sarl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}

// SBB m32, imm32 — subtract with borrow (81 /3).
void Assembler::sbbl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbx, dst, imm32); // /3 opcode extension encoded via rbx
}

// SBB r32, imm32.
void Assembler::sbbl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD8, dst, imm32);
}


// SBB r32, m32 (1B /r).
void Assembler::sbbl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}

// SBB r32, r32.
void Assembler::sbbl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}

// SETcc r8 — set byte on condition (0F 90+cc /r); 'true' requests
// byte-register REX handling.
void Assembler::setb(Condition cc, Register dst) {
  assert(0 <= cc && cc < 16, "illegal cc");
  int encode = prefix_and_encode(dst->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0x90 | cc);
  emit_byte(0xC0 | encode);
}
2664 
// SHL r32, imm8 — logical left shift. Uses the one-byte-shorter
// 0xD1 /4 form when the count is 1, otherwise 0xC1 /4 ib.
void Assembler::shll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  if (imm8 == 1 ) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}

// SHL r32, CL — logical left shift by the count in CL (D3 /4).
void Assembler::shll(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}
2683 
2684 void Assembler::shrl(Register dst, int imm8) {
2685   assert(isShiftCount(imm8), "illegal shift count");
2686   int encode = prefix_and_encode(dst->encoding());
2687   emit_byte(0xC1);
2688   emit_byte(0xE8 | encode);
2689   emit_byte(imm8);
2690 }
2691 
// SHR r32, CL — logical right shift by the count in CL (D3 /5).
void Assembler::shrl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}

// copies a single word from [esi] to [edi]
// MOVSD string move, no REP prefix (0xA5).
void Assembler::smovl() {
  emit_byte(0xA5);
}
2702 
// SQRTSD xmm, xmm — scalar double square root (F2-prefixed 0F 51 /r).
void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x51);
  emit_byte(0xC0 | encode);
}

// SQRTSD xmm, m64 — memory form.
void Assembler::sqrtsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x51);
  emit_operand(dst, src);
}

// SQRTSS xmm, xmm — scalar float square root (F3-prefixed 0F 51 /r).
void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x51);
  emit_byte(0xC0 | encode);
}

// SQRTSS xmm, m32 — memory form.
void Assembler::sqrtss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x51);
  emit_operand(dst, src);
}

// STMXCSR m32 — store the MXCSR control/status register (0F AE /3).
void Assembler::stmxcsr( Address dst) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(3), dst); // /3 opcode extension
}
2741 
// SUB m32, imm32 — (81 /5; emit_arith_operand may pick the sign-extended imm8 form).
void Assembler::subl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbp, dst, imm32); // /5 opcode extension encoded via rbp
}

// SUB m32, r32 (29 /r).
void Assembler::subl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}

// SUB r32, imm32.
void Assembler::subl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE8, dst, imm32);
}

// SUB r32, m32 (2B /r).
void Assembler::subl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}

// SUB r32, r32.
void Assembler::subl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
2771 
// SUBSD xmm, xmm — scalar double subtract (F2-prefixed 0F 5C /r).
void Assembler::subsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// SUBSD xmm, m64 — memory form.
void Assembler::subsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

// SUBSS xmm, xmm — scalar float subtract (F3-prefixed 0F 5C /r).
void Assembler::subss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// SUBSS xmm, m32 — memory form.
void Assembler::subss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_operand(dst, src);
}
2801 
// TEST r8, imm8 — byte-register AND test (F6 /0 ib); 'true' requests
// byte-register REX handling.
void Assembler::testb(Register dst, int imm8) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  (void) prefix_and_encode(dst->encoding(), true);
  emit_arith_b(0xF6, 0xC0, dst, imm8);
}

// TEST r32, imm32 — uses the short EAX-specific opcode (A9 id) when possible.
void Assembler::testl(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    emit_byte(0xA9); // TEST EAX, imm32 short form
  } else {
    encode = prefix_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}

// TEST r32, r32 (85 /r).
void Assembler::testl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}

// TEST r32, m32 (85 /r).
void Assembler::testl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x85);
  emit_operand(dst, src);
}
2834 
// UCOMISD xmm, m64 — unordered scalar double compare, sets EFLAGS (66 0F 2E /r).
void Assembler::ucomisd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x2E);
  emit_operand(dst, src);
}

// UCOMISD xmm, xmm — register form.
void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x2E);
  emit_byte(0xC0 | encode);
}

// UCOMISS xmm, m32 — unordered scalar float compare (no mandatory prefix, 0F 2E /r).
void Assembler::ucomiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2E);
  emit_operand(dst, src);
}

// UCOMISS xmm, xmm — register form.
void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2E);
  emit_byte(0xC0 | encode);
}
2864 
2865 
// XADD m32, r32 (0F C1): exchange-and-add; src receives the old memory value.
void Assembler::xaddl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}

// XCHG r32, m32 (0x87): implicitly locked when a memory operand is involved.
void Assembler::xchgl(Register dst, Address src) { // xchg
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}

// XCHG r32, r32 (register-direct form, no implicit lock).
void Assembler::xchgl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xc0 | encode);
}
2886 
// XOR r32, imm32 (81 /6, the 0xF0 arith-extension selects /6 = XOR).
void Assembler::xorl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF0, dst, imm32);
}

// XOR r32, m32 (opcode 0x33).
void Assembler::xorl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}

// XOR r32, r32 (register-direct form of 0x33).
void Assembler::xorl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}

// XORPD xmm, xmm (66 0F 57): bitwise xor of packed doubles.
void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

// XORPD xmm, m128.
void Assembler::xorpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x57);
  emit_operand(dst, src);
}


// XORPS xmm, xmm (0F 57): bitwise xor of packed singles.
void Assembler::xorps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

// XORPS xmm, m128.
void Assembler::xorps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x57);
  emit_operand(dst, src);
}
2934 
2935 // AVX 3-operands non destructive source instructions (encoded with VEX prefix)
2936 
// VADDSD xmm, xmm, m64 (VEX.F2 0x58): dst = nds + [src], scalar double.
void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_operand(dst, src);
}

// VADDSD xmm, xmm, xmm (register-direct form).
void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

// VADDSS xmm, xmm, m32 (VEX.F3 0x58): dst = nds + [src], scalar single.
void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_operand(dst, src);
}

// VADDSS xmm, xmm, xmm (register-direct form).
void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

// VANDPD xmm, xmm, m128 (VEX.66 0x54): packed-double bitwise and.
void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_66);
  emit_byte(0x54);
  emit_operand(dst, src);
}

// VANDPS xmm, xmm, m128 (VEX 0x54, no SIMD prefix): packed-single bitwise and.
void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_NONE);
  emit_byte(0x54);
  emit_operand(dst, src);
}

// VDIVSD xmm, xmm, m64 (VEX.F2 0x5E): dst = nds / [src], scalar double.
void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

// VDIVSD xmm, xmm, xmm (register-direct form).
void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

// VDIVSS xmm, xmm, m32 (VEX.F3 0x5E): dst = nds / [src], scalar single.
void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

// VDIVSS xmm, xmm, xmm (register-direct form).
void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

// VMULSD xmm, xmm, m64 (VEX.F2 0x59): dst = nds * [src], scalar double.
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_operand(dst, src);
}

// VMULSD xmm, xmm, xmm (register-direct form).
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}
3027 
3028 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
3029   InstructionMark im(this);
3030   vex_prefix(dst, nds, src, VEX_SIMD_F3);
3031   emit_byte(0x59);
3032   emit_operand(dst, src);
3033 }
3034 
// VMULSS xmm, xmm, xmm (register-direct form of VEX.F3 0x59).
void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}


// VSUBSD xmm, xmm, m64 (VEX.F2 0x5C): dst = nds - [src], scalar double.
void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

// VSUBSD xmm, xmm, xmm (register-direct form).
void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// VSUBSS xmm, xmm, m32 (VEX.F3 0x5C): dst = nds - [src], scalar single.
void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

// VSUBSS xmm, xmm, xmm (register-direct form).
void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// VXORPD xmm, xmm, m128 (VEX.66 0x57): packed-double bitwise xor.
void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_66);
  emit_byte(0x57);
  emit_operand(dst, src);
}

// VXORPS xmm, xmm, m128 (VEX 0x57, no SIMD prefix): packed-single bitwise xor.
void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_NONE);
  emit_byte(0x57);
  emit_operand(dst, src);
}
3088 
3089 
3090 #ifndef _LP64
3091 // 32bit only pieces of the assembler
3092 
// CMP r32, imm32 with relocation (81 /7; 0xF8 = ModRM 11/111/reg).
// 32-bit only, so no REX prefix is ever needed.
void Assembler::cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT
  InstructionMark im(this);
  emit_byte(0x81);
  emit_byte(0xF8 | src1->encoding());
  emit_data(imm32, rspec, 0);
}

// CMP m32, imm32 with relocation (81 /7; rdi's encoding 7 supplies the /7 digit).
void Assembler::cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs)
  InstructionMark im(this);
  emit_byte(0x81);
  emit_operand(rdi, src1);
  emit_data(imm32, rspec, 0);
}

// The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax,
// and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded
// into rdx:rax.  The ZF is set if the compared values were equal, and cleared otherwise.
// Encoding: 0F C7 /1 (CMPXCHG8B); rcx's encoding 1 supplies the /1 digit.
void Assembler::cmpxchg8(Address adr) {
  InstructionMark im(this);
  emit_byte(0x0F);
  emit_byte(0xc7);
  emit_operand(rcx, adr);
}
3118 
3119 void Assembler::decl(Register dst) {
3120   // Don't use it directly. Use MacroAssembler::decrementl() instead.
3121  emit_byte(0x48 | dst->encoding());
3122 }
3123 
3124 #endif // _LP64
3125 
3126 // 64bit typically doesn't use the x87 but needs to for the trig funcs
3127 
// FABS: ST(0) = |ST(0)|  (D9 E1).
void Assembler::fabs() {
  emit_byte(0xD9);
  emit_byte(0xE1);
}

// FADD ST(0), ST(i)  (D8 C0+i).
void Assembler::fadd(int i) {
  emit_farith(0xD8, 0xC0, i);
}

// FADD m64fp (DC /0; rax's encoding 0 supplies the /0 digit).
void Assembler::fadd_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rax, src);
}

// FADD m32fp (D8 /0).
void Assembler::fadd_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rax, src);
}

// FADD with ST(i) as destination (DC C0+i).
void Assembler::fadda(int i) {
  emit_farith(0xDC, 0xC0, i);
}

// FADDP ST(i), ST(0) then pop (DE C0+i).
void Assembler::faddp(int i) {
  emit_farith(0xDE, 0xC0, i);
}

// FCHS: ST(0) = -ST(0)  (D9 E0).
void Assembler::fchs() {
  emit_byte(0xD9);
  emit_byte(0xE0);
}

// FCOM ST(i)  (D8 D0+i): compare ST(0) with ST(i), set FPU flags.
void Assembler::fcom(int i) {
  emit_farith(0xD8, 0xD0, i);
}

// FCOMP ST(i)  (D8 D8+i): compare and pop.
void Assembler::fcomp(int i) {
  emit_farith(0xD8, 0xD8, i);
}

// FCOMP m64fp (DC /3; rbx's encoding 3 supplies the /3 digit).
void Assembler::fcomp_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbx, src);
}

// FCOMP m32fp (D8 /3).
void Assembler::fcomp_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbx, src);
}

// FCOMPP: compare ST(0) with ST(1), pop both (DE D9).
void Assembler::fcompp() {
  emit_byte(0xDE);
  emit_byte(0xD9);
}

// FCOS: ST(0) = cos(ST(0))  (D9 FF).
void Assembler::fcos() {
  emit_byte(0xD9);
  emit_byte(0xFF);
}

// FDECSTP: decrement FPU stack-top pointer (D9 F6).
void Assembler::fdecstp() {
  emit_byte(0xD9);
  emit_byte(0xF6);
}

// FDIV ST(0), ST(i)  (D8 F0+i).
void Assembler::fdiv(int i) {
  emit_farith(0xD8, 0xF0, i);
}

// FDIV m64fp (DC /6; rsi's encoding 6 supplies the /6 digit).
void Assembler::fdiv_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsi, src);
}

// FDIV m32fp (D8 /6).
void Assembler::fdiv_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsi, src);
}

// Divide with ST(i) as destination (DC F8+i).
void Assembler::fdiva(int i) {
  emit_farith(0xDC, 0xF8, i);
}

// Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994)
//       is erroneous for some of the floating-point instructions below.

void Assembler::fdivp(int i) {
  emit_farith(0xDE, 0xF8, i);                    // ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong)
}

// Reverse divide: ST(0) = ST(i) / ST(0)  (D8 F8+i).
void Assembler::fdivr(int i) {
  emit_farith(0xD8, 0xF8, i);
}

// FDIVR m64fp (DC /7; rdi's encoding 7 supplies the /7 digit).
void Assembler::fdivr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rdi, src);
}

// FDIVR m32fp (D8 /7).
void Assembler::fdivr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rdi, src);
}

// Reverse divide with ST(i) as destination (DC F0+i).
void Assembler::fdivra(int i) {
  emit_farith(0xDC, 0xF0, i);
}

void Assembler::fdivrp(int i) {
  emit_farith(0xDE, 0xF0, i);                    // ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong)
}
3247 
// FFREE ST(i): mark register as empty (DD C0+i).
void Assembler::ffree(int i) {
  emit_farith(0xDD, 0xC0, i);
}

// FILD m64int (DF /5; rbp's encoding 5 supplies the /5 digit).
void Assembler::fild_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rbp, adr);
}

// FILD m32int (DB /0).
void Assembler::fild_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rax, adr);
}

// FINCSTP: increment FPU stack-top pointer (D9 F7).
void Assembler::fincstp() {
  emit_byte(0xD9);
  emit_byte(0xF7);
}

// FINIT (wait form: 9B FWAIT prefix + DB E3 FNINIT): reset the FPU.
void Assembler::finit() {
  emit_byte(0x9B);
  emit_byte(0xDB);
  emit_byte(0xE3);
}

// FIST m32int (DB /2; rdx's encoding 2 supplies the /2 digit).
void Assembler::fist_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdx, adr);
}

// FISTP m64int (DF /7): store integer and pop.
void Assembler::fistp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rdi, adr);
}

// FISTP m32int (DB /3): store integer and pop.
void Assembler::fistp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbx, adr);
}

// FLD1: push +1.0  (D9 E8).
void Assembler::fld1() {
  emit_byte(0xD9);
  emit_byte(0xE8);
}

// FLD m64fp (DD /0).
void Assembler::fld_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rax, adr);
}

// FLD m32fp (D9 /0).
void Assembler::fld_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rax, adr);
}


// FLD ST(i): push a copy of ST(i)  (D9 C0+i).
void Assembler::fld_s(int index) {
  emit_farith(0xD9, 0xC0, index);
}

// FLD m80fp, extended precision (DB /5).
void Assembler::fld_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbp, adr);
}

// FLDCW m16: load FPU control word (D9 /5).
void Assembler::fldcw(Address src) {
  InstructionMark im(this);
  emit_byte(0xd9);
  emit_operand32(rbp, src);
}

// FLDENV: load FPU environment (D9 /4; rsp's encoding 4 supplies the /4 digit).
void Assembler::fldenv(Address src) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rsp, src);
}

// FLDLG2: push log10(2)  (D9 EC).
void Assembler::fldlg2() {
  emit_byte(0xD9);
  emit_byte(0xEC);
}

// FLDLN2: push ln(2)  (D9 ED).
void Assembler::fldln2() {
  emit_byte(0xD9);
  emit_byte(0xED);
}

// FLDZ: push +0.0  (D9 EE).
void Assembler::fldz() {
  emit_byte(0xD9);
  emit_byte(0xEE);
}

// ln(x) = ln(2) * log2(x): push ln2, swap so x is on top, then FYL2X.
void Assembler::flog() {
  fldln2();
  fxch();
  fyl2x();
}

// log10(x) = log10(2) * log2(x): same shape as flog() with the log10(2) constant.
void Assembler::flog10() {
  fldlg2();
  fxch();
  fyl2x();
}
3359 
// FMUL ST(0), ST(i)  (D8 C8+i).
void Assembler::fmul(int i) {
  emit_farith(0xD8, 0xC8, i);
}

// FMUL m64fp (DC /1; rcx's encoding 1 supplies the /1 digit).
void Assembler::fmul_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rcx, src);
}

// FMUL m32fp (D8 /1).
void Assembler::fmul_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rcx, src);
}

// Multiply with ST(i) as destination (DC C8+i).
void Assembler::fmula(int i) {
  emit_farith(0xDC, 0xC8, i);
}

// FMULP ST(i), ST(0) then pop (DE C8+i).
void Assembler::fmulp(int i) {
  emit_farith(0xDE, 0xC8, i);
}

// FNSAVE: store FPU state, no wait prefix (DD /6).
void Assembler::fnsave(Address dst) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsi, dst);
}

// Store FPU control word (D9 /7).  NOTE(review): the emitted 0x9B FWAIT
// prefix makes this the waiting FSTCW form despite the "fnstcw" name.
void Assembler::fnstcw(Address src) {
  InstructionMark im(this);
  emit_byte(0x9B);
  emit_byte(0xD9);
  emit_operand32(rdi, src);
}

// FNSTSW AX: store FPU status word into AX, no wait (DF E0).
void Assembler::fnstsw_ax() {
  emit_byte(0xdF);
  emit_byte(0xE0);
}

// FPREM: partial remainder, truncating (D9 F8).
void Assembler::fprem() {
  emit_byte(0xD9);
  emit_byte(0xF8);
}

// FPREM1: IEEE partial remainder, round-to-nearest (D9 F5).
void Assembler::fprem1() {
  emit_byte(0xD9);
  emit_byte(0xF5);
}

// FRSTOR: restore FPU state saved by fnsave (DD /4).
void Assembler::frstor(Address src) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsp, src);
}

// FSIN: ST(0) = sin(ST(0))  (D9 FE).
void Assembler::fsin() {
  emit_byte(0xD9);
  emit_byte(0xFE);
}

// FSQRT: ST(0) = sqrt(ST(0))  (D9 FA).
void Assembler::fsqrt() {
  emit_byte(0xD9);
  emit_byte(0xFA);
}

// FST m64fp (DD /2): store without popping.
void Assembler::fst_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rdx, adr);
}

// FST m32fp (D9 /2): store without popping.
void Assembler::fst_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rdx, adr);
}

// FSTP m64fp (DD /3): store and pop.
void Assembler::fstp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rbx, adr);
}

// FSTP ST(i)  (DD D8+i): copy ST(0) into ST(i) and pop.
void Assembler::fstp_d(int index) {
  emit_farith(0xDD, 0xD8, index);
}

// FSTP m32fp (D9 /3): store and pop.
void Assembler::fstp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rbx, adr);
}

// FSTP m80fp (DB /7): store extended precision and pop.
void Assembler::fstp_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdi, adr);
}
3461 
// FSUB ST(0), ST(i)  (D8 E0+i).
void Assembler::fsub(int i) {
  emit_farith(0xD8, 0xE0, i);
}

// FSUB m64fp (DC /4; rsp's encoding 4 supplies the /4 digit).
void Assembler::fsub_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsp, src);
}

// FSUB m32fp (D8 /4).
void Assembler::fsub_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsp, src);
}

// Subtract with ST(i) as destination (DC E8+i).
void Assembler::fsuba(int i) {
  emit_farith(0xDC, 0xE8, i);
}

void Assembler::fsubp(int i) {
  emit_farith(0xDE, 0xE8, i);                    // ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong)
}

// Reverse subtract: ST(0) = ST(i) - ST(0)  (D8 E8+i).
void Assembler::fsubr(int i) {
  emit_farith(0xD8, 0xE8, i);
}

// FSUBR m64fp (DC /5; rbp's encoding 5 supplies the /5 digit).
void Assembler::fsubr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbp, src);
}

// FSUBR m32fp (D8 /5).
void Assembler::fsubr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbp, src);
}

// Reverse subtract with ST(i) as destination (DC E0+i).
void Assembler::fsubra(int i) {
  emit_farith(0xDC, 0xE0, i);
}

void Assembler::fsubrp(int i) {
  emit_farith(0xDE, 0xE0, i);                    // ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong)
}

// FPTAN (D9 F2) pushes tan(ST(0)) result plus a 1.0; the trailing
// FSTP ST(0) (DD D8) pops that extra 1.0 so only tan remains.
void Assembler::ftan() {
  emit_byte(0xD9);
  emit_byte(0xF2);
  emit_byte(0xDD);
  emit_byte(0xD8);
}

// FTST: compare ST(0) against +0.0  (D9 E4).
void Assembler::ftst() {
  emit_byte(0xD9);
  emit_byte(0xE4);
}

// FUCOMI ST(0), ST(i)  (DB E8+i): unordered compare, sets EFLAGS directly.
void Assembler::fucomi(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDB, 0xE8, i);
}

// FUCOMIP ST(0), ST(i)  (DF E8+i): as fucomi, then pop.
void Assembler::fucomip(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDF, 0xE8, i);
}

// FWAIT (9B): wait for pending FPU exceptions.
void Assembler::fwait() {
  emit_byte(0x9B);
}

// FXCH ST(i): exchange ST(0) and ST(i)  (D9 C8+i).
void Assembler::fxch(int i) {
  emit_farith(0xD9, 0xC8, i);
}

// FYL2X: ST(1) = ST(1) * log2(ST(0)), pop  (D9 F1).
void Assembler::fyl2x() {
  emit_byte(0xD9);
  emit_byte(0xF1);
}
3546 
// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
// Index 0 (VEX_SIMD_NONE) means no prefix byte is emitted.
static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
// Index 0/1 mean a plain 0F map; 0x38/0x3A select the three-byte opcode maps.
static int simd_opc[4] = { 0,    0, 0x38, 0x3A };
3551 
// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
// Emission order matters: SIMD prefix byte, then REX, then 0F escape and
// optional 38/3A map byte.  Memory-operand form.
void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);     // 66/F3/F2 mandatory prefix, if any
  }
  if (rex_w) {
    prefixq(adr, xreg);           // REX.W form (64-bit operand size)
  } else {
    prefix(adr, xreg);
  }
  if (opc > 0) {
    emit_byte(0x0F);              // two-byte opcode escape
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);            // 38/3A three-byte map selector
    }
  }
}

// Register-register form of the above; returns the ModRM low bits
// (dst/src encodings with any REX extension stripped off).
int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);     // 66/F3/F2 mandatory prefix, if any
  }
  int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
                          prefix_and_encode(dst_enc, src_enc);
  if (opc > 0) {
    emit_byte(0x0F);              // two-byte opcode escape
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);            // 38/3A three-byte map selector
    }
  }
  return encode;
}
3586 
3587 
3588 void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
3589   if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
3590     prefix(VEX_3bytes);
3591 
3592     int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
3593     byte1 = (~byte1) & 0xE0;
3594     byte1 |= opc;
3595     a_byte(byte1);
3596 
3597     int byte2 = ((~nds_enc) & 0xf) << 3;
3598     byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
3599     emit_byte(byte2);
3600   } else {
3601     prefix(VEX_2bytes);
3602 
3603     int byte1 = vex_r ? VEX_R : 0;
3604     byte1 = (~byte1) & 0x80;
3605     byte1 |= ((~nds_enc) & 0xf) << 3;
3606     byte1 |= (vector256 ? 4 : 0) | pre;
3607     emit_byte(byte1);
3608   }
3609 }
3610 
3611 void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
3612   bool vex_r = (xreg_enc >= 8);
3613   bool vex_b = adr.base_needs_rex();
3614   bool vex_x = adr.index_needs_rex();
3615   vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
3616 }
3617 
3618 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
3619   bool vex_r = (dst_enc >= 8);
3620   bool vex_b = (src_enc >= 8);
3621   bool vex_x = false;
3622   vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
3623   return (((dst_enc & 7) << 3) | (src_enc & 7));
3624 }
3625 
3626 
// Emit the appropriate prefix for a SIMD instruction with a memory operand:
// a VEX prefix when AVX is enabled, otherwise the legacy SSE prefix+REX.
// In the legacy path nds must be absent or equal to xreg (2-operand SSE form).
void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  if (UseAVX > 0) {
    int xreg_enc = xreg->encoding();
    int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
    rex_prefix(adr, xreg, pre, opc, rex_w);
  }
}

// Register-register variant; returns the bits to OR into the ModRM byte.
int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  int dst_enc = dst->encoding();
  int src_enc = src->encoding();
  if (UseAVX > 0) {
    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
    return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
  }
}
3649 
3650 #ifndef _LP64
3651 
// INC r32, single-byte 0x40+rd form (32-bit only; these opcodes are REX
// prefixes in 64-bit mode).
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  emit_byte(0x40 | dst->encoding());
}

// On 32-bit, a pointer-sized lea is just leal.
void Assembler::lea(Register dst, Address src) {
  leal(dst, src);
}

// MOV m32, imm32 with relocation (C7 /0).
void Assembler::mov_literal32(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xC7);
  emit_operand(rax, dst);
  emit_data((int)imm32, rspec, 0);
}

// MOV r32, imm32 with relocation (B8+rd short form).
void Assembler::mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, 0);
}

// POPAD (0x61): pop all general-purpose registers.
void Assembler::popa() { // 32bit
  emit_byte(0x61);
}

// PUSH imm32 with relocation (0x68).
void Assembler::push_literal32(int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0x68);
  emit_data(imm32, rspec, 0);
}

// PUSHAD (0x60): push all general-purpose registers.
void Assembler::pusha() { // 32bit
  emit_byte(0x60);
}

// SETNE dst_byte (0F 95): dst's low byte = (ZF == 0).
void Assembler::set_byte_if_not_zero(Register dst) {
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | dst->encoding());
}

// SHLD r/m32, r32, CL (0F A5): double-precision shift left.
void Assembler::shldl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xA5);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}

// SHRD r/m32, r32, CL (0F AD): double-precision shift right.
void Assembler::shrdl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xAD);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}
3706 
3707 #else // LP64
3708 
// SETNE dst_byte (0F 95) — 64-bit variant; the byteinst prefix call emits
// REX where the target's low byte register requires it.
void Assembler::set_byte_if_not_zero(Register dst) {
  int enc = prefix_and_encode(dst->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | enc);
}
3715 
3716 // 64bit only pieces of the assembler
3717 // This should only be used by 64bit instructions that can use rip-relative
3718 // it cannot be used by instructions that want an immediate value.
3719 
// Decide whether 'adr' can be addressed rip-relatively (disp32) from anywhere
// code might run.  Returns false when a full 64-bit literal must be emitted
// instead.  The decision is based on the literal's relocation type first,
// then on worst-case disp32 range checks against both code-cache bounds.
bool Assembler::reachable(AddressLiteral adr) {
  int64_t disp;
  // None will force a 64bit literal to the code stream. Likely a placeholder
  // for something that will be patched later and we need to certain it will
  // always be reachable.
  if (adr.reloc() == relocInfo::none) {
    return false;
  }
  if (adr.reloc() == relocInfo::internal_word_type) {
    // This should be rip relative and easily reachable.
    return true;
  }
  if (adr.reloc() == relocInfo::virtual_call_type ||
      adr.reloc() == relocInfo::opt_virtual_call_type ||
      adr.reloc() == relocInfo::static_call_type ||
      adr.reloc() == relocInfo::static_stub_type ) {
    // This should be rip relative within the code cache and easily
    // reachable until we get huge code caches. (At which point
    // ic code is going to have issues).
    return true;
  }
  if (adr.reloc() != relocInfo::external_word_type &&
      adr.reloc() != relocInfo::poll_return_type &&  // these are really external_word but need special
      adr.reloc() != relocInfo::poll_type &&         // relocs to identify them
      adr.reloc() != relocInfo::runtime_call_type ) {
    return false;
  }

  // Stress the correction code
  if (ForceUnreachable) {
    // Must be runtimecall reloc, see if it is in the codecache
    // Flipping stuff in the codecache to be unreachable causes issues
    // with things like inline caches where the additional instructions
    // are not handled.
    if (CodeCache::find_blob(adr._target) == NULL) {
      return false;
    }
  }
  // For external_word_type/runtime_call_type if it is reachable from where we
  // are now (possibly a temp buffer) and where we might end up
  // anywhere in the codeCache then we are always reachable.
  // This would have to change if we ever save/restore shared code
  // to be more pessimistic.
  disp = (int64_t)adr._target - ((int64_t)CodeCache::low_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;
  disp = (int64_t)adr._target - ((int64_t)CodeCache::high_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;

  disp = (int64_t)adr._target - ((int64_t)_code_pos + sizeof(int));

  // Because rip relative is a disp + address_of_next_instruction and we
  // don't know the value of address_of_next_instruction we apply a fudge factor
  // to make sure we will be ok no matter the size of the instruction we get placed into.
  // We don't have to fudge the checks above here because they are already worst case.

  // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal
  // + 4 because better safe than sorry.
  const int fudge = 12 + 4;
  if (disp < 0) {
    disp -= fudge;                  // widen the magnitude away from zero
  } else {
    disp += fudge;
  }
  return is_simm32(disp);
}
3785 
3786 // Check if the polling page is not reachable from the code cache using rip-relative
3787 // addressing.
// Check if the polling page is not reachable from the code cache using rip-relative
// addressing.  Tests the disp32 range against both code-cache extremes so the
// answer holds no matter where within the cache the polling instruction lands.
bool Assembler::is_polling_page_far() {
  intptr_t addr = (intptr_t)os::get_polling_page();
  return ForceUnreachable ||
         !is_simm32(addr - (intptr_t)CodeCache::low_bound()) ||
         !is_simm32(addr - (intptr_t)CodeCache::high_bound());
}
3794 
// Emit a 64-bit data word; wraps the rtype in a RelocationHolder unless no
// relocation is needed.
void Assembler::emit_data64(jlong data,
                            relocInfo::relocType rtype,
                            int format) {
  if (rtype == relocInfo::none) {
    emit_long64(data);
  } else {
    emit_data64(data, Relocation::spec_simple(rtype), format);
  }
}

// Emit a relocated 64-bit data word.  Must be called inside an
// InstructionMark so the relocation can anchor on the instruction start.
void Assembler::emit_data64(jlong data,
                            RelocationHolder const& rspec,
                            int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(imm_operand == format, "must be immediate");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  // Do not use AbstractAssembler::relocate, which is not intended for
  // embedded words.  Instead, relocate to the enclosing instruction.
  code_section()->relocate(inst_mark(), rspec, format);
#ifdef ASSERT
  check_relocation(rspec, format);
#endif
  emit_long64(data);
}
3819 
// Emit REX for a single register operand if needed and return the 3-bit
// encoding to place in the ModRM byte.  For byte instructions, registers
// 4-7 (SPL/BPL/SIL/DIL) need a plain REX to avoid the legacy AH..BH meaning.
int Assembler::prefix_and_encode(int reg_enc, bool byteinst) {
  if (reg_enc >= 8) {
    prefix(REX_B);
    reg_enc -= 8;
  } else if (byteinst && reg_enc >= 4) {
    prefix(REX);
  }
  return reg_enc;
}

// Same for a 64-bit operand: REX.W is always emitted, with B added for r8-r15.
int Assembler::prefixq_and_encode(int reg_enc) {
  if (reg_enc < 8) {
    prefix(REX_W);
  } else {
    prefix(REX_WB);
    reg_enc -= 8;
  }
  return reg_enc;
}

// Two-register form: emits the combined REX (R extends dst, B extends src)
// and returns the packed ModRM reg/rm bits (dst in reg field, src in rm).
int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst) {
  if (dst_enc < 8) {
    if (src_enc >= 8) {
      prefix(REX_B);
      src_enc -= 8;
    } else if (byteinst && src_enc >= 4) {
      prefix(REX);
    }
  } else {
    if (src_enc < 8) {
      prefix(REX_R);
    } else {
      prefix(REX_RB);
      src_enc -= 8;
    }
    dst_enc -= 8;
  }
  return dst_enc << 3 | src_enc;
}

// Two-register 64-bit form: as above but always with REX.W set.
int Assembler::prefixq_and_encode(int dst_enc, int src_enc) {
  if (dst_enc < 8) {
    if (src_enc < 8) {
      prefix(REX_W);
    } else {
      prefix(REX_WB);
      src_enc -= 8;
    }
  } else {
    if (src_enc < 8) {
      prefix(REX_WR);
    } else {
      prefix(REX_WRB);
      src_enc -= 8;
    }
    dst_enc -= 8;
  }
  return dst_enc << 3 | src_enc;
}
3879 
// Emit REX.B if the register is one of r8-r15.
void Assembler::prefix(Register reg) {
  if (reg->encoding() >= 8) {
    prefix(REX_B);
  }
}

// Emit the REX prefix demanded by an address's base (B) and index (X)
// registers; nothing if neither needs extension.
void Assembler::prefix(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_XB);
    } else {
      prefix(REX_B);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_X);
    }
  }
}

// 64-bit-operand variant: always emits some REX.W form, adding B/X as needed.
void Assembler::prefixq(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_WXB);
    } else {
      prefix(REX_WB);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_WX);
    } else {
      prefix(REX_W);
    }
  }
}
3915 
3916 
// Emit the REX prefix for a reg+memory operand pair: R extends the register,
// B/X extend the address's base/index.  For byte instructions, registers 4-7
// need a bare REX so their low byte (SPL..DIL) is selected.
void Assembler::prefix(Address adr, Register reg, bool byteinst) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      } else if (byteinst && reg->encoding() >= 4 ) {
        prefix(REX);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}

// 64-bit-operand variant of the above: always some REX.W form, adding
// R for the register and B/X for the address as needed.
void Assembler::prefixq(Address adr, Register src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}
3980 
// Emit the REX prefix for a memory operand plus an XMM register operand.
// Same layout as the GP-register variant, but XMM registers have no
// byte-instruction special case.
void Assembler::prefix(Address adr, XMMRegister reg) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}

// REX.W variant for XMM + memory operands; a REX byte is always emitted.
void Assembler::prefixq(Address adr, XMMRegister src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}
4042 
// ADC r64, imm32 — REX.W + 0x81 /2 (emit_arith may pick the short imm8 form).
void Assembler::adcq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD0, dst, imm32);
}

// ADC r64, m64 — REX.W + 0x13 /r.
void Assembler::adcq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}
4054 
4055 void Assembler::adcq(Register dst, Register src) {
4056   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4057   emit_arith(0x13, 0xC0, dst, src);
4058 }
4059 
// ADD m64, imm32 — REX.W + 0x81 /0 (rax selects /0; emit_arith_operand may
// emit the sign-extended imm8 form 0x83 instead).
void Assembler::addq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rax, dst,imm32);
}

// ADD m64, r64 — REX.W + 0x01 /r.
void Assembler::addq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}

// ADD r64, imm32 — REX.W + 0x81 /0.
void Assembler::addq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC0, dst, imm32);
}

// ADD r64, m64 — REX.W + 0x03 /r.
void Assembler::addq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}

// ADD r64, r64 — REX.W + 0x03 /r.
void Assembler::addq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
4089 
// AND m64, imm32 — REX.W + 0x81 /4 (rsp selects /4).
// NOTE(review): unlike addq/subq/sbbq(Address, imm32) this emits the 4-byte
// immediate form unconditionally instead of going through emit_arith_operand,
// so the short sign-extended imm8 (0x83) encoding is never used — confirm
// whether any caller relies on the fixed instruction length.
void Assembler::andq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);
  emit_long(imm32);
}

// AND r64, imm32 — REX.W + 0x81 /4.
void Assembler::andq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE0, dst, imm32);
}

// AND r64, m64 — REX.W + 0x23 /r.
void Assembler::andq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}
4109 
4110 void Assembler::andq(Register dst, Register src) {
4111   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4112   emit_arith(0x23, 0xC0, dst, src);
4113 }
4114 
// BSF r64, r64 — REX.W + 0F BC /r (bit scan forward).
void Assembler::bsfq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

// BSR r64, r64 — REX.W + 0F BD /r.  With an F3 prefix this same opcode
// decodes as LZCNT on supporting CPUs, hence the guard.
void Assembler::bsrq(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// BSWAP r64 — REX.W + 0F C8+rd (register encoded in the opcode byte).
void Assembler::bswapq(Register reg) {
  int encode = prefixq_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}
4135 
// CQO — REX.W + 0x99: sign-extend rax into rdx:rax.
void Assembler::cdqq() {
  prefix(REX_W);
  emit_byte(0x99);
}

// CLFLUSH m8 — 0F AE /7 (rdi selects /7); flushes the cache line at adr.
void Assembler::clflush(Address adr) {
  prefix(adr);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(rdi, adr);
}
4147 
// CMOVcc r64, r64 — REX.W + 0F (40 | cc) /r.
void Assembler::cmovq(Condition cc, Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}

// CMOVcc r64, m64 — REX.W + 0F (40 | cc) /r.
void Assembler::cmovq(Condition cc, Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
4162 
// CMP m64, imm32 — REX.W + 0x81 /7 (rdi selects /7); always the 4-byte
// immediate form.
void Assembler::cmpq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}

// CMP r64, imm32 — REX.W + 0x81 /7.
void Assembler::cmpq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xF8, dst, imm32);
}
4175 
4176 void Assembler::cmpq(Address dst, Register src) {
4177   InstructionMark im(this);
4178   prefixq(dst, src);
4179   emit_byte(0x3B);
4180   emit_operand(src, dst);
4181 }
4182 
// CMP r64, r64 — REX.W + 0x3B /r.
void Assembler::cmpq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}

// CMP r64, m64 — REX.W + 0x3B /r.
void Assembler::cmpq(Register dst, Address  src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}

// CMPXCHG m64, r64 — REX.W + 0F B1 /r (compare rax with [adr], exchange
// with reg on match; callers add any LOCK prefix themselves).
void Assembler::cmpxchgq(Register reg, Address adr) {
  InstructionMark im(this);
  prefixq(adr, reg);
  emit_byte(0x0F);
  emit_byte(0xB1);
  emit_operand(reg, adr);
}
4202 
// CVTSI2SD xmm, r64 — F2-prefixed, REX.W form, opcode 0x2A
// (prefix bytes produced by the SIMD prefix helper).
void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

// CVTSI2SD xmm, m64 — memory-source form of the above.
void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// CVTSI2SS xmm, r64 — F3-prefixed, REX.W form, opcode 0x2A.
void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

// CVTSI2SS xmm, m64 — memory-source form of the above.
void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// CVTTSD2SI r64, xmm — truncating double-to-long, F2 prefix, opcode 0x2C.
void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

// CVTTSS2SI r64, xmm — truncating float-to-long, F3 prefix, opcode 0x2C.
void Assembler::cvttss2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
4246 
// DEC r32 — FF /1 two-byte form.
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}

// DEC r64 — REX.W + FF /1.
void Assembler::decq(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}

// DEC m64 — REX.W + FF /1 (rcx selects /1).
void Assembler::decq(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}
4270 
// FXRSTOR — 0F AE /1 with REX.W (restores x87/SSE state from src).
void Assembler::fxrstor(Address src) {
  prefixq(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(1), src);  // /1 extension field
}

// FXSAVE — 0F AE /0 with REX.W (saves x87/SSE state to dst).
void Assembler::fxsave(Address dst) {
  prefixq(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(0), dst);  // /0 extension field
}

// IDIV r64 — REX.W + F7 /7: signed divide rdx:rax by src.
void Assembler::idivq(Register src) {
  int encode = prefixq_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}
4290 
// IMUL r64, r64 — REX.W + 0F AF /r.
void Assembler::imulq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}

// IMUL r64, r64, imm — picks the short 6B /r ib form when the immediate
// fits in a signed byte, otherwise 69 /r id.
void Assembler::imulq(Register dst, Register src, int value) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}
4310 
// INC r32 — FF /0 two-byte form.
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}

// INC r64 — REX.W + FF /0.
void Assembler::incq(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}

// INC m64 — REX.W + FF /0 (rax selects /0).
void Assembler::incq(Address dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}
4334 
// On 64-bit, the generic lea() is the 64-bit form.
void Assembler::lea(Register dst, Address src) {
  leaq(dst, src);
}

// LEA r64, m — REX.W + 8D /r.
void Assembler::leaq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8D);
  emit_operand(dst, src);
}
4345 
// MOV r64, imm64 — REX.W + B8+rd io (full 8-byte immediate).
void Assembler::mov64(Register dst, int64_t imm64) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_long64(imm64);
}

// Same encoding as mov64 but records relocation info for the immediate.
void Assembler::mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data64(imm64, rspec);
}
4359 
// MOV r32, imm32 carrying a narrow-oop relocation (32-bit operand size —
// narrow oops are 32 bits, hence prefix_and_encode, not prefixq).
void Assembler::mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// MOV m32, imm32 with a narrow-oop relocation — C7 /0 (rax selects /0).
void Assembler::mov_narrow_oop(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// CMP r32, imm32 with a narrow-oop relocation — 0x81 /7.
void Assembler::cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(src1->encoding());
  emit_byte(0x81);
  emit_byte(0xF8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// CMP m32, imm32 with a narrow-oop relocation — 0x81 /7 (rax here encodes
// the displacement size argument path; extension field comes from rax? —
// NOTE(review): /0 via rax vs CMP's /7 — confirm against emit_operand usage.
void Assembler::cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(src1);
  emit_byte(0x81);
  emit_operand(rax, src1, 4);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4390 
// LZCNT r64, r64 — F3 REX.W 0F BD /r.  The mandatory F3 prefix must precede
// the REX byte, hence it is emitted before prefixq_and_encode.  On CPUs
// without LZCNT this byte pattern decodes as BSR, hence the guard.
void Assembler::lzcntq(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
4399 
// MOVQ xmm, r64 — 66-prefixed REX.W form, opcode 0x6E.
void Assembler::movdq(XMMRegister dst, Register src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}

// MOVQ r64, xmm — 66-prefixed REX.W form, opcode 0x7E.
void Assembler::movdq(Register dst, XMMRegister src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}
4416 
// MOV r64, r64 — REX.W + 8B /r.
void Assembler::movq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}

// MOV r64, m64 — REX.W + 8B /r.
void Assembler::movq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

// MOV m64, r64 — REX.W + 89 /r.
void Assembler::movq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
4436 
// MOVSX r64, m8 — REX.W + 0F BE /r (sign-extend byte to quadword).
void Assembler::movsbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}

// MOVSX r64, r8 — REX.W + 0F BE /r.
void Assembler::movsbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);
}
4451 
// Disabled: see comments below.  NOTE(review): besides being untested, the
// body looks mis-encoded — emit_byte(0xC7 | encode) ORs the register bits
// into the opcode byte itself instead of emitting 0xC7 followed by a ModRM
// byte; harmless only because ShouldNotReachHere() precedes it.
void Assembler::movslq(Register dst, int32_t imm32) {
  // dbx shows movslq(rcx, 3) as movq     $0x0000000049000000,(%rbx)
  // and movslq(r8, 3); as movl     $0x0000000048000000,(%rbx)
  // as a result we shouldn't use until tested at runtime...
  ShouldNotReachHere();
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xC7 | encode);
  emit_long(imm32);
}

// MOV m64, imm32 (sign-extended) — REX.W + C7 /0; imm32 must round-trip.
void Assembler::movslq(Address dst, int32_t imm32) {
  assert(is_simm32(imm32), "lost bits");
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_long(imm32);
}

// MOVSXD r64, m32 — REX.W + 63 /r (sign-extend dword to quadword).
void Assembler::movslq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x63);
  emit_operand(dst, src);
}

// MOVSXD r64, r32 — REX.W + 63 /r.
void Assembler::movslq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x63);
  emit_byte(0xC0 | encode);
}
4484 
// MOVSX r64, m16 — REX.W + 0F BF /r (sign-extend word to quadword).
void Assembler::movswq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}

// MOVSX r64, r16 — REX.W + 0F BF /r.
void Assembler::movswq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}

// MOVZX r64, m8 — REX.W + 0F B6 /r (zero-extend byte to quadword).
void Assembler::movzbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}

// MOVZX r64, r8 — REX.W + 0F B6 /r.
void Assembler::movzbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}

// MOVZX r64, m16 — REX.W + 0F B7 /r (zero-extend word to quadword).
void Assembler::movzwq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}

// MOVZX r64, r16 — REX.W + 0F B7 /r.
void Assembler::movzwq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}
4529 
// NEG r64 — REX.W + F7 /3 (two's-complement negate).
void Assembler::negq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD8 | encode);
}

// NOT r64 — REX.W + F7 /2 (one's-complement).
void Assembler::notq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD0 | encode);
}
4541 
// OR m64, imm32 — REX.W + 0x81 /1 (rcx selects /1); 4-byte immediate form.
void Assembler::orq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rcx, dst, 4);
  emit_long(imm32);
}

// OR r64, imm32 — REX.W + 0x81 /1.
void Assembler::orq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC8, dst, imm32);
}

// OR r64, m64 — REX.W + 0x0B /r.
void Assembler::orq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}

// OR r64, r64 — REX.W + 0x0B /r.
void Assembler::orq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}
4566 
// Restore all 15 GP registers saved by pusha() (64-bit: there is no hardware
// POPA).  Slot layout mirrors pusha(): slot i holds register with encoding
// (15 - i); slot 11 is skipped because it holds the pre-pusha rsp, which is
// restored implicitly by the final stack adjustment.
void Assembler::popa() { // 64bit
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9,  Address(rsp, 6 * wordSize));
  movq(r8,  Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  // skip rsp
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));

  addq(rsp, 16 * wordSize);
}
4587 
// POPCNT r64, m64 — F3 REX.W 0F B8 /r.  The mandatory F3 prefix must come
// before the REX byte, hence it is emitted ahead of prefixq().
void Assembler::popcntq(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}

// POPCNT r64, r64 — F3 REX.W 0F B8 /r.
void Assembler::popcntq(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}

// POP m64 — 8F /0 (rax selects /0); REX.W via prefixq for extended base/index.
void Assembler::popq(Address dst) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst);
}
4613 
// Save all 15 GP registers (64-bit: there is no hardware PUSHA).  rsp itself
// is stored first at rsp - 5*wordSize, which after the 16-word stack adjust
// becomes slot 11 — the slot popa() skips.  Relies on the ABI red zone
// (128 bytes below rsp are scratch) for that pre-adjust store.
void Assembler::pusha() { // 64bit
  // we have to store original rsp.  ABI says that 128 bytes
  // below rsp are local scratch.
  movq(Address(rsp, -5 * wordSize), rsp);

  subq(rsp, 16 * wordSize);

  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  // skip rsp
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}

// PUSH m64 — FF /6 (rsi selects /6).
void Assembler::pushq(Address src) {
  InstructionMark im(this);
  prefixq(src);
  emit_byte(0xFF);
  emit_operand(rsi, src);
}
4645 
// RCL r64, imm8 — uses the shorter REX.W D1 /2 form when the count is 1,
// otherwise REX.W C1 /2 ib.
void Assembler::rclq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}
// SAR r64, imm8 — short REX.W D1 /7 form when the count is 1, else C1 /7 ib.
void Assembler::sarq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xF8 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xF8 | encode);
    emit_byte(imm8);
  }
}

// SAR r64, cl — REX.W + D3 /7 (count taken from cl).
void Assembler::sarq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}
4676 
// SBB m64, imm32 — REX.W + 0x81 /3 (rbx selects /3; emit_arith_operand may
// use the short sign-extended imm8 form 0x83).
void Assembler::sbbq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbx, dst, imm32);
}

// SBB r64, imm32 — REX.W + 0x81 /3.
void Assembler::sbbq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD8, dst, imm32);
}

// SBB r64, m64 — REX.W + 0x1B /r.
void Assembler::sbbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}

// SBB r64, r64 — REX.W + 0x1B /r.
void Assembler::sbbq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}
4699 
// SHL r64, imm8 — short REX.W D1 /4 form when the count is 1, else C1 /4 ib.
void Assembler::shlq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}

// SHL r64, cl — REX.W + D3 /4 (count taken from cl).
void Assembler::shlq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}
4718 
4719 void Assembler::shrq(Register dst, int imm8) {
4720   assert(isShiftCount(imm8 >> 1), "illegal shift count");
4721   int encode = prefixq_and_encode(dst->encoding());
4722   emit_byte(0xC1);
4723   emit_byte(0xE8 | encode);
4724   emit_byte(imm8);
4725 }
4726 
// SHR r64, cl — REX.W + D3 /5 (count taken from cl).
void Assembler::shrq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}
4732 
// SUB m64, imm32 — REX.W + 0x81 /5 (rbp selects /5; emit_arith_operand may
// use the short sign-extended imm8 form 0x83).
void Assembler::subq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);
}

// SUB m64, r64 — REX.W + 0x29 /r.
void Assembler::subq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}

// SUB r64, imm32 — REX.W + 0x81 /5.
void Assembler::subq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE8, dst, imm32);
}

// SUB r64, m64 — REX.W + 0x2B /r.
void Assembler::subq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}

// SUB r64, r64 — REX.W + 0x2B /r.
void Assembler::subq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
4762 
// TEST r64, imm32 — uses the short rax-specific form REX.W + A9 id when
// dst is rax (encoding 0), otherwise REX.W + F7 /0 id.
void Assembler::testq(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    prefix(REX_W);
    emit_byte(0xA9);
  } else {
    encode = prefixq_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}

// TEST r64, r64 — REX.W + 0x85 /r.
void Assembler::testq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}
4783 
// XADD m64, r64 — REX.W + 0F C1 /r (callers add any LOCK prefix themselves).
void Assembler::xaddq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}

// XCHG r64, m64 — REX.W + 0x87 /r (implicitly locked by the hardware).
void Assembler::xchgq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}

// XCHG r64, r64 — REX.W + 0x87 /r.
void Assembler::xchgq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xc0 | encode);
}
4804 
// XOR r64, r64 — REX.W + 0x33 /r.
void Assembler::xorq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}

// XOR r64, m64 — REX.W + 0x33 /r.
void Assembler::xorq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}
4816 
#endif // _LP64
4818 
// Maps each Assembler::Condition code (0x0 - 0xF, used as the index) to its
// logical negation, as documented by the per-entry comments.
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
4838 
4839 
4840 // Implementation of MacroAssembler
4841 
4842 // First all the versions that have distinct versions depending on 32/64 bit
4843 // Unless the difference is trivial (1 line or so).
4844 
4845 #ifndef _LP64
4846 
4847 // 32bit versions
4848 
// 32-bit: an AddressLiteral's absolute target can be used as a plain Address
// directly (relocation info carried along).
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// 32-bit: build an Address for an array element from an ArrayAddress.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}
4856 
4857 int MacroAssembler::biased_locking_enter(Register lock_reg,
4858                                          Register obj_reg,
4859                                          Register swap_reg,
4860                                          Register tmp_reg,
4861                                          bool swap_reg_contains_mark,
4862                                          Label& done,
4863                                          Label* slow_case,
4864                                          BiasedLockingCounters* counters) {
4865   assert(UseBiasedLocking, "why call this otherwise?");
4866   assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
4867   assert_different_registers(lock_reg, obj_reg, swap_reg);
4868 
4869   if (PrintBiasedLockingStatistics && counters == NULL)
4870     counters = BiasedLocking::counters();
4871 
4872   bool need_tmp_reg = false;
4873   if (tmp_reg == noreg) {
4874     need_tmp_reg = true;
4875     tmp_reg = lock_reg;
4876   } else {
4877     assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
4878   }
4879   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
4880   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
4881   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
4882   Address saved_mark_addr(lock_reg, 0);
4883 
4884   // Biased locking
4885   // See whether the lock is currently biased toward our thread and
4886   // whether the epoch is still valid
4887   // Note that the runtime guarantees sufficient alignment of JavaThread
4888   // pointers to allow age to be placed into low bits
4889   // First check to see whether biasing is even enabled for this object
4890   Label cas_label;
4891   int null_check_offset = -1;
4892   if (!swap_reg_contains_mark) {
4893     null_check_offset = offset();
4894     movl(swap_reg, mark_addr);
4895   }
4896   if (need_tmp_reg) {
4897     push(tmp_reg);
4898   }
4899   movl(tmp_reg, swap_reg);
4900   andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
4901   cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
4902   if (need_tmp_reg) {
4903     pop(tmp_reg);
4904   }
4905   jcc(Assembler::notEqual, cas_label);
4906   // The bias pattern is present in the object's header. Need to check
4907   // whether the bias owner and the epoch are both still current.
4908   // Note that because there is no current thread register on x86 we
4909   // need to store off the mark word we read out of the object to
4910   // avoid reloading it and needing to recheck invariants below. This
4911   // store is unfortunate but it makes the overall code shorter and
4912   // simpler.
4913   movl(saved_mark_addr, swap_reg);
4914   if (need_tmp_reg) {
4915     push(tmp_reg);
4916   }
4917   get_thread(tmp_reg);
4918   xorl(swap_reg, tmp_reg);
4919   if (swap_reg_contains_mark) {
4920     null_check_offset = offset();
4921   }
4922   movl(tmp_reg, klass_addr);
4923   xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
4924   andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
4925   if (need_tmp_reg) {
4926     pop(tmp_reg);
4927   }
4928   if (counters != NULL) {
4929     cond_inc32(Assembler::zero,
4930                ExternalAddress((address)counters->biased_lock_entry_count_addr()));
4931   }
4932   jcc(Assembler::equal, done);
4933 
4934   Label try_revoke_bias;
4935   Label try_rebias;
4936 
4937   // At this point we know that the header has the bias pattern and
4938   // that we are not the bias owner in the current epoch. We need to
4939   // figure out more details about the state of the header in order to
4940   // know what operations can be legally performed on the object's
4941   // header.
4942 
4943   // If the low three bits in the xor result aren't clear, that means
4944   // the prototype header is no longer biased and we have to revoke
4945   // the bias on this object.
4946   testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
4947   jcc(Assembler::notZero, try_revoke_bias);
4948 
4949   // Biasing is still enabled for this data type. See whether the
4950   // epoch of the current bias is still valid, meaning that the epoch
4951   // bits of the mark word are equal to the epoch bits of the
4952   // prototype header. (Note that the prototype header's epoch bits
4953   // only change at a safepoint.) If not, attempt to rebias the object
4954   // toward the current thread. Note that we must be absolutely sure
4955   // that the current epoch is invalid in order to do this because
4956   // otherwise the manipulations it performs on the mark word are
4957   // illegal.
4958   testl(swap_reg, markOopDesc::epoch_mask_in_place);
4959   jcc(Assembler::notZero, try_rebias);
4960 
4961   // The epoch of the current bias is still valid but we know nothing
4962   // about the owner; it might be set or it might be clear. Try to
4963   // acquire the bias of the object using an atomic operation. If this
4964   // fails we will go in to the runtime to revoke the object's bias.
4965   // Note that we first construct the presumed unbiased header so we
4966   // don't accidentally blow away another thread's valid bias.
4967   movl(swap_reg, saved_mark_addr);
4968   andl(swap_reg,
4969        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
4970   if (need_tmp_reg) {
4971     push(tmp_reg);
4972   }
4973   get_thread(tmp_reg);
4974   orl(tmp_reg, swap_reg);
4975   if (os::is_MP()) {
4976     lock();
4977   }
4978   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
4979   if (need_tmp_reg) {
4980     pop(tmp_reg);
4981   }
4982   // If the biasing toward our thread failed, this means that
4983   // another thread succeeded in biasing it toward itself and we
4984   // need to revoke that bias. The revocation will occur in the
4985   // interpreter runtime in the slow case.
4986   if (counters != NULL) {
4987     cond_inc32(Assembler::zero,
4988                ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
4989   }
4990   if (slow_case != NULL) {
4991     jcc(Assembler::notZero, *slow_case);
4992   }
4993   jmp(done);
4994 
4995   bind(try_rebias);
4996   // At this point we know the epoch has expired, meaning that the
4997   // current "bias owner", if any, is actually invalid. Under these
4998   // circumstances _only_, we are allowed to use the current header's
4999   // value as the comparison value when doing the cas to acquire the
5000   // bias in the current epoch. In other words, we allow transfer of
5001   // the bias from one thread to another directly in this situation.
5002   //
5003   // FIXME: due to a lack of registers we currently blow away the age
5004   // bits in this situation. Should attempt to preserve them.
5005   if (need_tmp_reg) {
5006     push(tmp_reg);
5007   }
5008   get_thread(tmp_reg);
5009   movl(swap_reg, klass_addr);
5010   orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
5011   movl(swap_reg, saved_mark_addr);
5012   if (os::is_MP()) {
5013     lock();
5014   }
5015   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
5016   if (need_tmp_reg) {
5017     pop(tmp_reg);
5018   }
5019   // If the biasing toward our thread failed, then another thread
5020   // succeeded in biasing it toward itself and we need to revoke that
5021   // bias. The revocation will occur in the runtime in the slow case.
5022   if (counters != NULL) {
5023     cond_inc32(Assembler::zero,
5024                ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
5025   }
5026   if (slow_case != NULL) {
5027     jcc(Assembler::notZero, *slow_case);
5028   }
5029   jmp(done);
5030 
5031   bind(try_revoke_bias);
5032   // The prototype mark in the klass doesn't have the bias bit set any
5033   // more, indicating that objects of this data type are not supposed
5034   // to be biased any more. We are going to try to reset the mark of
5035   // this object to the prototype value and fall through to the
5036   // CAS-based locking scheme. Note that if our CAS fails, it means
5037   // that another thread raced us for the privilege of revoking the
5038   // bias of this particular object, so it's okay to continue in the
5039   // normal locking code.
5040   //
5041   // FIXME: due to a lack of registers we currently blow away the age
5042   // bits in this situation. Should attempt to preserve them.
5043   movl(swap_reg, saved_mark_addr);
5044   if (need_tmp_reg) {
5045     push(tmp_reg);
5046   }
5047   movl(tmp_reg, klass_addr);
5048   movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
5049   if (os::is_MP()) {
5050     lock();
5051   }
5052   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
5053   if (need_tmp_reg) {
5054     pop(tmp_reg);
5055   }
5056   // Fall through to the normal CAS-based lock, because no matter what
5057   // the result of the above CAS, some thread must have succeeded in
5058   // removing the bias bit from the object's header.
5059   if (counters != NULL) {
5060     cond_inc32(Assembler::zero,
5061                ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
5062   }
5063 
5064   bind(cas_label);
5065 
5066   return null_check_offset;
5067 }
// 32-bit leaf call into the VM runtime (no oop map / last_Java_frame setup).
// The C arguments have already been pushed on the stack by the caller
// (see pass_argN); after the call we discard them by bumping rsp.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);  // caller-pops C calling convention
}
5073 
// Compare a memory operand against an oop constant. On 32-bit the oop is
// embedded as a 32-bit immediate with an oop relocation so the GC can patch it.
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5077 
// Compare a register against an oop constant (32-bit immediate + oop reloc).
void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5081 
5082 void MacroAssembler::extend_sign(Register hi, Register lo) {
5083   // According to Intel Doc. AP-526, "Integer Divide", p.18.
5084   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
5085     cdql();
5086   } else {
5087     movl(hi, lo);
5088     sarl(hi, 31);
5089   }
5090 }
5091 
// Emit a single 5-byte nop built from four ignored segment-override
// prefixes plus a nop. Being one instruction, it can be atomically
// overwritten by a 5-byte jump when patching a verified entry point
// (see patch_verified_entry).
void MacroAssembler::fat_nop() {
  // A 5 byte nop that is safe for patching (see patch_verified_entry)
  emit_byte(0x26); // es:
  emit_byte(0x2e); // cs:
  emit_byte(0x64); // fs:
  emit_byte(0x65); // gs:
  emit_byte(0x90); // nop
}
5100 
// Branch to L if FPU condition flag C2 is set.
// fnstsw_ax copies the FPU status word into ax; sahf moves ah into
// EFLAGS, where C2 lands in the parity flag. rax is preserved via tmp.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch on parity <=> C2 set
  jcc(Assembler::parity, L);
}
5110 
// Branch to L if FPU condition flag C2 is NOT set (inverse of jC2).
// Same status-word-to-EFLAGS trick as jC2; we just branch on noParity.
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch on no-parity <=> C2 clear
  jcc(Assembler::noParity, L);
}
5120 
5121 // 32bit can do a case table jump in one instruction but we no longer allow the base
5122 // to be installed in the Address class
// Case-table jump: on 32-bit the whole array address (base + scaled index)
// fits in one jmp instruction.
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}
5126 
5127 // Note: y_lo will be destroyed
// Compare two longs held in register pairs and produce -1/0/1 in x_hi
// (Java 'lcmp' semantics). High words compare signed; when equal, low
// words compare unsigned.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);      // x < y by high word
  jcc(Assembler::greater, high);  // x > y by high word
  // x_hi is the return register
  xorl(x_hi, x_hi);               // high words equal: tentatively 0
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);     // unsigned compare of low words
  jcc(Assembler::equal, done);    // fully equal -> result 0

  bind(high);                     // result 1
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);                      // result -1
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}
5152 
// Load the literal's address (not its contents) into dst, carrying the
// literal's relocation so the constant can be patched.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}
5156 
// Store the literal's address into a memory operand.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}
5162 
// Tear down the current frame: reset rsp to the frame pointer, then
// restore the caller's rbp (equivalent to the LEAVE instruction).
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
5167 
// 64-bit multiply of two stack-resident Java longs; result in rdx:rax.
// Clobbers rax, rbx, rcx, rdx.
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
5207 
// Two's-complement negate of the 64-bit value in hi:lo.
// neg(lo) sets carry iff lo != 0; adc folds that borrow into hi before
// hi itself is negated.
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}
5213 
// Java long shift-left of the pair hi:lo by the count in rcx.
void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
5233 
5234 
// Java long shift-right of the pair hi:lo by the count in rcx;
// arithmetic (lshr of Java '>>') when sign_extension, else logical ('>>>').
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
5255 
// Load an oop constant into a register as a patched 32-bit immediate.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5259 
// Store an oop constant directly to memory (32-bit immediate + oop reloc).
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5263 
// Load from / of an AddressLiteral: an lval wants the literal's address
// itself (with relocation); otherwise load the value stored at it.
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}
5271 
// Store a register into an array slot; a plain movl on 32-bit.
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}
5275 
// Load from an array slot into a register; a plain movl on 32-bit.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}
5279 
5280 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (a raw pointer stored this way would carry no relocation info).
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
5284 
5285 
// Restore the registers saved by push_callee_saved_registers, in
// exactly the reverse order of the pushes.
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}
5292 
// Reload the FPU top-of-stack from the two machine words at rsp (where
// push_fTOS spilled it) and pop them off the stack.
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}
5297 
// Save rsi/rdi/rdx/rcx on the stack; paired with
// pop_callee_saved_registers which pops in reverse order.
void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}
5304 
// Spill the FPU top-of-stack (a double, two machine words) onto the
// stack, popping it off the FPU stack; see pop_fTOS for the reload.
void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}
5309 
5310 
// Push an oop constant as a 32-bit immediate with an oop relocation.
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}
5314 
5315 
// Push an AddressLiteral: an lval pushes the literal's address itself
// (with relocation); otherwise push the value stored at it.
void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}
5323 
// Materialize the current ZF as a full word: dst := (ZF clear) ? 1 : 0.
// The xor must not disturb flags-setting done by the caller beforehand;
// xor of a register with itself sets ZF but set_byte_if_not_zero reads
// the flags captured by setb-style encoding inside it.
// NOTE(review): relies on set_byte_if_not_zero consuming flags set
// *before* this call survived the xorl — xorl does alter flags, so the
// helper presumably re-tests; confirm against its definition.
void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}
5328 
// Argument-passing helpers for VM calls: on 32-bit x86 every C argument
// goes on the stack, so each positional helper is simply a push. Callers
// invoke them in reverse argument order so arg0 ends up on top.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5344 
5345 #ifndef PRODUCT
5346 extern "C" void findpc(intptr_t x);
5347 #endif
5348 
5349 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
5350   // In order to get locks to work, we need to fake a in_VM state
5351   JavaThread* thread = JavaThread::current();
5352   JavaThreadState saved_state = thread->thread_state();
5353   thread->set_thread_state(_thread_in_vm);
5354   if (ShowMessageBoxOnError) {
5355     JavaThread* thread = JavaThread::current();
5356     JavaThreadState saved_state = thread->thread_state();
5357     thread->set_thread_state(_thread_in_vm);
5358     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
5359       ttyLocker ttyl;
5360       BytecodeCounter::print();
5361     }
5362     // To see where a verify_oop failed, get $ebx+40/X for this frame.
5363     // This is the value of eip which points to where verify_oop will return.
5364     if (os::message_box(msg, "Execution stopped, print registers?")) {
5365       ttyLocker ttyl;
5366       tty->print_cr("eip = 0x%08x", eip);
5367 #ifndef PRODUCT
5368       if ((WizardMode || Verbose) && PrintMiscellaneous) {
5369         tty->cr();
5370         findpc(eip);
5371         tty->cr();
5372       }
5373 #endif
5374       tty->print_cr("rax = 0x%08x", rax);
5375       tty->print_cr("rbx = 0x%08x", rbx);
5376       tty->print_cr("rcx = 0x%08x", rcx);
5377       tty->print_cr("rdx = 0x%08x", rdx);
5378       tty->print_cr("rdi = 0x%08x", rdi);
5379       tty->print_cr("rsi = 0x%08x", rsi);
5380       tty->print_cr("rbp = 0x%08x", rbp);
5381       tty->print_cr("rsp = 0x%08x", rsp);
5382       BREAKPOINT;
5383       assert(false, "start up GDB");
5384     }
5385   } else {
5386     ttyLocker ttyl;
5387     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
5388     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
5389   }
5390   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
5391 }
5392 
// Emit code that halts execution with a message: pushes msg and the
// current eip (via a call to the next instruction), saves all registers
// with pusha, then calls debug32 — whose argument list matches this
// stack layout exactly.
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                           // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}
5402 
// Emit code that prints a warning via the VM's warning() routine and
// continues. Unlike stop(), all CPU state is preserved around the call.
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
5414 
5415 #else // _LP64
5416 
5417 // 64 bit versions
5418 
// 64-bit: convert a (reachable) AddressLiteral into a pc-relative Address.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");   // target must be within +/-2GB of pc
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}
5428 
// 64-bit: an array address may not be pc-reachable, so materialize the
// base into rscratch1 and build a register-based Address from it.
// NOTE: clobbers rscratch1.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
5437 
5438 int MacroAssembler::biased_locking_enter(Register lock_reg,
5439                                          Register obj_reg,
5440                                          Register swap_reg,
5441                                          Register tmp_reg,
5442                                          bool swap_reg_contains_mark,
5443                                          Label& done,
5444                                          Label* slow_case,
5445                                          BiasedLockingCounters* counters) {
5446   assert(UseBiasedLocking, "why call this otherwise?");
5447   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
5448   assert(tmp_reg != noreg, "tmp_reg must be supplied");
5449   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
5450   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
5451   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
5452   Address saved_mark_addr(lock_reg, 0);
5453 
5454   if (PrintBiasedLockingStatistics && counters == NULL)
5455     counters = BiasedLocking::counters();
5456 
5457   // Biased locking
5458   // See whether the lock is currently biased toward our thread and
5459   // whether the epoch is still valid
5460   // Note that the runtime guarantees sufficient alignment of JavaThread
5461   // pointers to allow age to be placed into low bits
5462   // First check to see whether biasing is even enabled for this object
5463   Label cas_label;
5464   int null_check_offset = -1;
5465   if (!swap_reg_contains_mark) {
5466     null_check_offset = offset();
5467     movq(swap_reg, mark_addr);
5468   }
5469   movq(tmp_reg, swap_reg);
5470   andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5471   cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
5472   jcc(Assembler::notEqual, cas_label);
5473   // The bias pattern is present in the object's header. Need to check
5474   // whether the bias owner and the epoch are both still current.
5475   load_prototype_header(tmp_reg, obj_reg);
5476   orq(tmp_reg, r15_thread);
5477   xorq(tmp_reg, swap_reg);
5478   andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
5479   if (counters != NULL) {
5480     cond_inc32(Assembler::zero,
5481                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5482   }
5483   jcc(Assembler::equal, done);
5484 
5485   Label try_revoke_bias;
5486   Label try_rebias;
5487 
5488   // At this point we know that the header has the bias pattern and
5489   // that we are not the bias owner in the current epoch. We need to
5490   // figure out more details about the state of the header in order to
5491   // know what operations can be legally performed on the object's
5492   // header.
5493 
5494   // If the low three bits in the xor result aren't clear, that means
5495   // the prototype header is no longer biased and we have to revoke
5496   // the bias on this object.
5497   testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5498   jcc(Assembler::notZero, try_revoke_bias);
5499 
5500   // Biasing is still enabled for this data type. See whether the
5501   // epoch of the current bias is still valid, meaning that the epoch
5502   // bits of the mark word are equal to the epoch bits of the
5503   // prototype header. (Note that the prototype header's epoch bits
5504   // only change at a safepoint.) If not, attempt to rebias the object
5505   // toward the current thread. Note that we must be absolutely sure
5506   // that the current epoch is invalid in order to do this because
5507   // otherwise the manipulations it performs on the mark word are
5508   // illegal.
5509   testq(tmp_reg, markOopDesc::epoch_mask_in_place);
5510   jcc(Assembler::notZero, try_rebias);
5511 
5512   // The epoch of the current bias is still valid but we know nothing
5513   // about the owner; it might be set or it might be clear. Try to
5514   // acquire the bias of the object using an atomic operation. If this
5515   // fails we will go in to the runtime to revoke the object's bias.
5516   // Note that we first construct the presumed unbiased header so we
5517   // don't accidentally blow away another thread's valid bias.
5518   andq(swap_reg,
5519        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
5520   movq(tmp_reg, swap_reg);
5521   orq(tmp_reg, r15_thread);
5522   if (os::is_MP()) {
5523     lock();
5524   }
5525   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5526   // If the biasing toward our thread failed, this means that
5527   // another thread succeeded in biasing it toward itself and we
5528   // need to revoke that bias. The revocation will occur in the
5529   // interpreter runtime in the slow case.
5530   if (counters != NULL) {
5531     cond_inc32(Assembler::zero,
5532                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5533   }
5534   if (slow_case != NULL) {
5535     jcc(Assembler::notZero, *slow_case);
5536   }
5537   jmp(done);
5538 
5539   bind(try_rebias);
5540   // At this point we know the epoch has expired, meaning that the
5541   // current "bias owner", if any, is actually invalid. Under these
5542   // circumstances _only_, we are allowed to use the current header's
5543   // value as the comparison value when doing the cas to acquire the
5544   // bias in the current epoch. In other words, we allow transfer of
5545   // the bias from one thread to another directly in this situation.
5546   //
5547   // FIXME: due to a lack of registers we currently blow away the age
5548   // bits in this situation. Should attempt to preserve them.
5549   load_prototype_header(tmp_reg, obj_reg);
5550   orq(tmp_reg, r15_thread);
5551   if (os::is_MP()) {
5552     lock();
5553   }
5554   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5555   // If the biasing toward our thread failed, then another thread
5556   // succeeded in biasing it toward itself and we need to revoke that
5557   // bias. The revocation will occur in the runtime in the slow case.
5558   if (counters != NULL) {
5559     cond_inc32(Assembler::zero,
5560                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
5561   }
5562   if (slow_case != NULL) {
5563     jcc(Assembler::notZero, *slow_case);
5564   }
5565   jmp(done);
5566 
5567   bind(try_revoke_bias);
5568   // The prototype mark in the klass doesn't have the bias bit set any
5569   // more, indicating that objects of this data type are not supposed
5570   // to be biased any more. We are going to try to reset the mark of
5571   // this object to the prototype value and fall through to the
5572   // CAS-based locking scheme. Note that if our CAS fails, it means
5573   // that another thread raced us for the privilege of revoking the
5574   // bias of this particular object, so it's okay to continue in the
5575   // normal locking code.
5576   //
5577   // FIXME: due to a lack of registers we currently blow away the age
5578   // bits in this situation. Should attempt to preserve them.
5579   load_prototype_header(tmp_reg, obj_reg);
5580   if (os::is_MP()) {
5581     lock();
5582   }
5583   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5584   // Fall through to the normal CAS-based lock, because no matter what
5585   // the result of the above CAS, some thread must have succeeded in
5586   // removing the bias bit from the object's header.
5587   if (counters != NULL) {
5588     cond_inc32(Assembler::zero,
5589                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
5590   }
5591 
5592   bind(cas_label);
5593 
5594   return null_check_offset;
5595 }
5596 
// 64-bit leaf call into the VM runtime. Arguments are already in
// registers (no stack args supported here). Ensures rsp is 16-byte
// aligned at the call, inserting an 8-byte adjustment when it is not.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for it's register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);   // already 16-byte aligned

  subq(rsp, 8);              // misaligned: pad by 8 around the call
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);                   // aligned path
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
5630 
// 64-bit compare of a register against a literal's contents. When the
// literal is outside pc-relative range, its address is first loaded
// into rscratch1 (clobbered).
void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}
5641 
// Emit a Java-correct 64-bit divide (ldiv/lrem): idivq traps on
// min_long / -1, so that case is special-cased to produce
// quotient = min_long, remainder = 0.
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);  // min_long / -1: rax already holds min_long

  // handle normal case
  bind(normal_case);
  cdqq();                               // sign-extend rax into rdx:rax
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
5678 
5679 void MacroAssembler::decrementq(Register reg, int value) {
5680   if (value == min_jint) { subq(reg, value); return; }
5681   if (value <  0) { incrementq(reg, -value); return; }
5682   if (value == 0) {                        ; return; }
5683   if (value == 1 && UseIncDec) { decq(reg) ; return; }
5684   /* else */      { subq(reg, value)       ; return; }
5685 }
5686 
5687 void MacroAssembler::decrementq(Address dst, int value) {
5688   if (value == min_jint) { subq(dst, value); return; }
5689   if (value <  0) { incrementq(dst, -value); return; }
5690   if (value == 0) {                        ; return; }
5691   if (value == 1 && UseIncDec) { decq(dst) ; return; }
5692   /* else */      { subq(dst, value)       ; return; }
5693 }
5694 
// Emit a 5-byte patchable nop for verified entry points (see
// patch_verified_entry): 66 66 90 66 90.
void MacroAssembler::fat_nop() {
  // A 5 byte nop that is safe for patching (see patch_verified_entry)
  // Recommened sequence from 'Software Optimization Guide for the AMD
  // Hammer Processor'
  emit_byte(0x66);
  emit_byte(0x66);
  emit_byte(0x90);
  emit_byte(0x66);
  emit_byte(0x90);
}
5705 
5706 void MacroAssembler::incrementq(Register reg, int value) {
5707   if (value == min_jint) { addq(reg, value); return; }
5708   if (value <  0) { decrementq(reg, -value); return; }
5709   if (value == 0) {                        ; return; }
5710   if (value == 1 && UseIncDec) { incq(reg) ; return; }
5711   /* else */      { addq(reg, value)       ; return; }
5712 }
5713 
5714 void MacroAssembler::incrementq(Address dst, int value) {
5715   if (value == min_jint) { addq(dst, value); return; }
5716   if (value <  0) { decrementq(dst, -value); return; }
5717   if (value == 0) {                        ; return; }
5718   if (value == 1 && UseIncDec) { incq(dst) ; return; }
5719   /* else */      { addq(dst, value)       ; return; }
5720 }
5721 
5722 // 32bit can do a case table jump in one instruction but we no longer allow the base
5723 // to be installed in the Address class
// 64-bit case-table jump: the table base may not be pc-reachable, so
// load it into rscratch1 and jump through a register-based address.
// Clobbers rscratch1.
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}
5731 
// Two-register long compare exists only for 32-bit; on 64-bit a jlong fits
// in one register, so reaching this entry point is an error.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
5736 
5737 void MacroAssembler::lea(Register dst, AddressLiteral src) {
5738     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
5739 }
5740 
// Store the literal's 64-bit address (with relocation) into the memory
// operand dst, staging it in rscratch1 (no mem <- imm64 move exists).
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}
5745 
// Emit a one-byte LEAVE instruction to tear down the current frame.
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_byte(0xC9); // LEAVE
}
5750 
// Two-register long negate exists only for 32-bit; on 64-bit a jlong fits
// in one register, so reaching this entry point is an error.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
5755 
// Load an oop constant into dst, recording an immediate-oop relocation for
// the embedded pointer.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}
5759 
// Store an oop constant into memory: materialize it (with its immediate-oop
// relocation) in rscratch1, then store.
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}
5764 
5765 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
5766   if (src.is_lval()) {
5767     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
5768   } else {
5769     if (reachable(src)) {
5770       movq(dst, as_Address(src));
5771     } else {
5772       lea(rscratch1, src);
5773       movq(dst, Address(rscratch1,0));
5774     }
5775   }
5776 }
5777 
// Pointer-sized store of src through an array address.
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}
5781 
// Pointer-sized load into dst from an array address.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}
5785 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (a raw pointer stored here would carry no relocation information).
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);  // stage in rscratch1: no mem <- imm64 move exists
  movq(dst, rscratch1);
}
5791 
// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  // movslq sign-extends the 32-bit immediate to fill the 64-bit slot.
  movslq(dst, src);
}
5796 
// Mostly for initializing NULL: sign-extend the 32-bit value to a full
// 64-bit register constant.
void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
5800 
// Push an oop constant: materialize it (with relocation) in rscratch1,
// then push the register.
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}
5805 
5806 void MacroAssembler::pushptr(AddressLiteral src) {
5807   lea(rscratch1, src);
5808   if (src.is_lval()) {
5809     push(rscratch1);
5810   } else {
5811     pushq(Address(rscratch1, 0));
5812   }
5813 }
5814 
// Tear down the JavaThread frame anchor. The sp slot is always cleared;
// fp and pc are cleared only when requested by the caller.
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}
5829 
// Publish the current Java frame in the JavaThread anchor. fp and pc are
// optional (skipped when invalid/NULL); sp is stored last, after the other
// anchor fields have been written.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
5854 
5855 static void pass_arg0(MacroAssembler* masm, Register arg) {
5856   if (c_rarg0 != arg ) {
5857     masm->mov(c_rarg0, arg);
5858   }
5859 }
5860 
5861 static void pass_arg1(MacroAssembler* masm, Register arg) {
5862   if (c_rarg1 != arg ) {
5863     masm->mov(c_rarg1, arg);
5864   }
5865 }
5866 
5867 static void pass_arg2(MacroAssembler* masm, Register arg) {
5868   if (c_rarg2 != arg ) {
5869     masm->mov(c_rarg2, arg);
5870   }
5871 }
5872 
5873 static void pass_arg3(MacroAssembler* masm, Register arg) {
5874   if (c_rarg3 != arg ) {
5875     masm->mov(c_rarg3, arg);
5876   }
5877 }
5878 
// Emit a fatal-stop sequence: save all registers with pusha, pass the
// message, the faulting rip and a pointer to the saved-register array to
// debug64, then halt. Never returns.
void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
5889 
// Emit a non-fatal warning sequence: save CPU state, call the VM's
// warning() routine with msg, then restore state and continue.
void MacroAssembler::warn(const char* msg) {
  push(rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call

  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  pop(rsp);
}
5900 
5901 #ifndef PRODUCT
5902 extern "C" void findpc(intptr_t x);
5903 #endif
5904 
// Runtime half of MacroAssembler::stop(): called from generated code with
// the message, the stopping pc and a pointer to the register array that
// stop() pushed with pusha. With ShowMessageBoxOnError it offers an
// interactive dump; otherwise it prints the message and asserts.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      // Indices into regs[] mirror the layout left by pusha() in stop();
      // regs[0] is the last register pushed.
      tty->print_cr("rax = 0x%016lx", regs[15]);
      tty->print_cr("rbx = 0x%016lx", regs[12]);
      tty->print_cr("rcx = 0x%016lx", regs[14]);
      tty->print_cr("rdx = 0x%016lx", regs[13]);
      tty->print_cr("rdi = 0x%016lx", regs[8]);
      tty->print_cr("rsi = 0x%016lx", regs[9]);
      tty->print_cr("rbp = 0x%016lx", regs[10]);
      tty->print_cr("rsp = 0x%016lx", regs[11]);
      tty->print_cr("r8  = 0x%016lx", regs[7]);
      tty->print_cr("r9  = 0x%016lx", regs[6]);
      tty->print_cr("r10 = 0x%016lx", regs[5]);
      tty->print_cr("r11 = 0x%016lx", regs[4]);
      tty->print_cr("r12 = 0x%016lx", regs[3]);
      tty->print_cr("r13 = 0x%016lx", regs[2]);
      tty->print_cr("r14 = 0x%016lx", regs[1]);
      tty->print_cr("r15 = 0x%016lx", regs[0]);
      BREAKPOINT;
    }
    // Restore the thread state we faked above.
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}
5954 
5955 #endif // _LP64
5956 
5957 // Now versions that are common to 32/64 bit
5958 
5959 void MacroAssembler::addptr(Register dst, int32_t imm32) {
5960   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
5961 }
5962 
5963 void MacroAssembler::addptr(Register dst, Register src) {
5964   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
5965 }
5966 
5967 void MacroAssembler::addptr(Address dst, Register src) {
5968   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
5969 }
5970 
5971 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
5972   if (reachable(src)) {
5973     Assembler::addsd(dst, as_Address(src));
5974   } else {
5975     lea(rscratch1, src);
5976     Assembler::addsd(dst, Address(rscratch1, 0));
5977   }
5978 }
5979 
5980 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
5981   if (reachable(src)) {
5982     addss(dst, as_Address(src));
5983   } else {
5984     lea(rscratch1, src);
5985     addss(dst, Address(rscratch1, 0));
5986   }
5987 }
5988 
5989 void MacroAssembler::align(int modulus) {
5990   if (offset() % modulus != 0) {
5991     nop(modulus - (offset() % modulus));
5992   }
5993 }
5994 
5995 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
5996   // Used in sign-masking with aligned address.
5997   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
5998   if (reachable(src)) {
5999     Assembler::andpd(dst, as_Address(src));
6000   } else {
6001     lea(rscratch1, src);
6002     Assembler::andpd(dst, Address(rscratch1, 0));
6003   }
6004 }
6005 
6006 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
6007   // Used in sign-masking with aligned address.
6008   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6009   if (reachable(src)) {
6010     Assembler::andps(dst, as_Address(src));
6011   } else {
6012     lea(rscratch1, src);
6013     Assembler::andps(dst, Address(rscratch1, 0));
6014   }
6015 }
6016 
6017 void MacroAssembler::andptr(Register dst, int32_t imm32) {
6018   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
6019 }
6020 
// Atomically increment the 32-bit counter at counter_addr, preserving the
// condition flags across the operation (pushf/popf). The lock prefix is
// emitted only on multiprocessor systems.
void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
  pushf();
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
  popf();
}
6028 
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
// size: number of bytes to bang (consumed); tmp: scratch cursor register.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // The -1 because we already subtracted 1 page.
  for (int i = 0; i< StackShadowPages-1; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    // Note: these stores are all relative to the final value of tmp,
    // emitted at assembly time (the loop is unrolled, not generated).
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
6051 
// Fast-path biased unlock: if obj_reg's mark word carries the biased-lock
// pattern, jump to done (nothing to undo). temp_reg is clobbered.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
6066 
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
6075 
// Wouldn't need if AddressLiteral version had new name
// Thin pass-through so the label form is still reachable alongside the
// MacroAssembler overloads.
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
6080 
// Indirect call through the register holding the target address.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
6084 
6085 void MacroAssembler::call(AddressLiteral entry) {
6086   if (reachable(entry)) {
6087     Assembler::call_literal(entry.target(), entry.rspec());
6088   } else {
6089     lea(rscratch1, entry);
6090     Assembler::call(rscratch1);
6091   }
6092 }
6093 
6094 // Implementation of call_VM versions
6095 
// call_VM with no arguments. Emits an out-of-line stub: the call to C
// pushes a return address that call_VM_helper later uses to derive
// last_Java_sp/pc; execution resumes at E after the stub returns.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
6109 
// call_VM with one argument; same out-of-line stub shape as the
// zero-argument form, with the argument shuffled into place first.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
6125 
// call_VM with two arguments. Arguments are placed last-to-first so that
// loading a later argument register cannot clobber an earlier incoming
// value (enforced by the assert).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
6146 
// call_VM with three arguments. Arguments are placed last-to-first so that
// loading a later argument register cannot clobber an earlier incoming
// value (enforced by the asserts).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
6172 
6173 void MacroAssembler::call_VM(Register oop_result,
6174                              Register last_java_sp,
6175                              address entry_point,
6176                              int number_of_arguments,
6177                              bool check_exceptions) {
6178   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
6179   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
6180 }
6181 
// One-argument form with explicit last_java_sp: shuffle the argument into
// place, then delegate to the count-based overload.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6190 
// Two-argument form with explicit last_java_sp. Arguments are placed
// last-to-first so a later register load cannot clobber an earlier value
// (enforced by the assert).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6203 
// Three-argument form with explicit last_java_sp. Arguments are placed
// last-to-first so a later register load cannot clobber an earlier value
// (enforced by the asserts).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6219 
6220 void MacroAssembler::super_call_VM(Register oop_result,
6221                                    Register last_java_sp,
6222                                    address entry_point,
6223                                    int number_of_arguments,
6224                                    bool check_exceptions) {
6225   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
6226   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
6227 }
6228 
// One-argument super_call_VM: shuffle the argument, then delegate to the
// count-based overload.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6237 
// Two-argument super_call_VM. Arguments are placed last-to-first so a later
// register load cannot clobber an earlier value (enforced by the assert).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6250 
// Three-argument super_call_VM. Arguments are placed last-to-first so a
// later register load cannot clobber an earlier value (enforced by the
// asserts).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6266 
// Core of every call_VM variant: establishes the Java frame anchor, calls
// the VM entry point with the current thread as implicit first argument,
// then tears the anchor down, checks for pending exceptions and fetches an
// oop result if requested. oop_result/last_java_sp may be noreg/invalid.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  LP64_ONLY(if (UseCompressedOops) verify_heapbase("call_VM_base");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      stop("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true, false);

#ifndef CC_INTERP
   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);
#endif /* CC_INTERP */

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach
    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
    movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
    verify_oop(oop_result, "broken oop in call_VM_base");
  }
}
6365 
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  // rax now holds the caller-visible sp; pass it as last_java_sp.
  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
6388 
// Leaf VM call (no frame anchor, no exception check) with an argument count.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
6392 
// One-argument leaf VM call: shuffle the argument, then delegate.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
6397 
// Two-argument leaf VM call. Arguments are placed last-to-first so a later
// register load cannot clobber an earlier value (enforced by the assert).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
6405 
// Three-argument leaf VM call. Arguments are placed last-to-first so a
// later register load cannot clobber an earlier value (enforced by the
// asserts).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
6415 
// One-argument leaf call; the explicit MacroAssembler:: qualification
// invokes this class's call_VM_leaf_base directly, bypassing any subclass
// override.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
6420 
// Two-argument super leaf call. Arguments are placed last-to-first so a
// later register load cannot clobber an earlier value (enforced by the
// assert).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
6428 
// Three-argument super leaf call. Arguments are placed last-to-first so a
// later register load cannot clobber an earlier value (enforced by the
// asserts).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
6438 
// Four-argument super leaf call. Arguments are placed last-to-first so a
// later register load cannot clobber an earlier value (enforced by the
// asserts).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
6452 
// Intentionally empty hook, invoked from call_VM_base after the VM call;
// presumably specialized where early-return support is needed — confirm
// against the interpreter's MacroAssembler usage.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
6455 
// Intentionally empty hook, invoked from call_VM_base after the VM call;
// presumably specialized where popframe support is needed — confirm
// against the interpreter's MacroAssembler usage.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
6458 
6459 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
6460   if (reachable(src1)) {
6461     cmpl(as_Address(src1), imm);
6462   } else {
6463     lea(rscratch1, src1);
6464     cmpl(Address(rscratch1, 0), imm);
6465   }
6466 }
6467 
6468 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
6469   assert(!src2.is_lval(), "use cmpptr");
6470   if (reachable(src2)) {
6471     cmpl(src1, as_Address(src2));
6472   } else {
6473     lea(rscratch1, src2);
6474     cmpl(src1, Address(rscratch1, 0));
6475   }
6476 }
6477 
// 32-bit register/immediate compare (thin wrapper over Assembler::cmpl).
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
6481 
// 32-bit register/memory compare (thin wrapper over Assembler::cmpl).
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
6485 
// Compare two doubles and materialize -1/0/+1 in dst. A NaN operand makes
// ucomisd report "unordered" (caught via the parity jump), which is mapped
// to -1 or +1 according to unordered_is_less (Java dcmpl vs. dcmpg).
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
6507 
// Compare two floats and materialize -1/0/+1 in dst. A NaN operand makes
// ucomiss report "unordered" (caught via the parity jump), which is mapped
// to -1 or +1 according to unordered_is_less (Java fcmpl vs. fcmpg).
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
6529 
6530 
6531 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
6532   if (reachable(src1)) {
6533     cmpb(as_Address(src1), imm);
6534   } else {
6535     lea(rscratch1, src1);
6536     cmpb(Address(rscratch1, 0), imm);
6537   }
6538 }
6539 
// Pointer compare of a register against an AddressLiteral. An lval compares
// against the literal's address itself; otherwise against the value stored
// there (through rscratch1 on 64-bit when out of reach).
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}
6559 
// Pointer compare of a memory operand against a literal address; only
// lval literals are supported (a true mem-mem compare does not exist).
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
6570 
// Pointer-sized compare-and-exchange against a literal address, with a
// lock prefix on MP systems. The lock() must immediately precede the
// cmpxchg in both branches, so the reachability split duplicates it.
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
  if (reachable(adr)) {
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    lea(rscratch1, adr);
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, Address(rscratch1, 0));
  }
}
6583 
6584 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
6585   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
6586 }
6587 
6588 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
6589   if (reachable(src)) {
6590     Assembler::comisd(dst, as_Address(src));
6591   } else {
6592     lea(rscratch1, src);
6593     Assembler::comisd(dst, Address(rscratch1, 0));
6594   }
6595 }
6596 
6597 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
6598   if (reachable(src)) {
6599     Assembler::comiss(dst, as_Address(src));
6600   } else {
6601     lea(rscratch1, src);
6602     Assembler::comiss(dst, Address(rscratch1, 0));
6603   }
6604 }
6605 
6606 
// Increment the 32-bit counter at counter_addr only when 'cond' holds:
// the increment is skipped by branching over it on the negated condition.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  atomic_incl(counter_addr);
  bind(L);
}
6614 
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case); // min_int / -1 would trap in idivl; rax already holds the result

  // handle normal case
  bind(normal_case);
  cdql(); // sign-extend rax into rdx:rax for the 64/32 divide
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
6650 
6651 
6652 
// Subtract 'value' from reg using the smallest encoding: nothing for 0,
// dec for 1 (when UseIncDec), incrementl for negative values.  min_jint
// must subtract directly because its negation overflows int.
void MacroAssembler::decrementl(Register reg, int value) {
  if (value == min_jint) {subl(reg, value) ; return; }
  if (value <  0) { incrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(reg) ; return; }
  /* else */      { subl(reg, value)       ; return; }
}

// Memory-operand form of the decrement above; same encoding selection.
void MacroAssembler::decrementl(Address dst, int value) {
  if (value == min_jint) {subl(dst, value) ; return; }
  if (value <  0) { incrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(dst) ; return; }
  /* else */      { subl(dst, value)       ; return; }
}
6668 
// Signed division of reg by 2^shift_value via arithmetic shift.
// A plain sar rounds toward minus infinity, so negative dividends are
// first biased by (2^shift - 1) to obtain round-toward-zero semantics.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
6685 
// Scalar divide from a literal address: use the address directly when
// reachable, otherwise materialize it in rscratch1 first.
void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::divsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::divsd(dst, Address(rscratch1, 0));
  }
}

// Single-precision variant of divsd above.
void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::divss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::divss(dst, Address(rscratch1, 0));
  }
}
6703 
// !defined(COMPILER2) is because of stupid core builds
#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
// Clear the whole x87 register stack: a single emms when MMX is
// available, otherwise explicitly free all eight slots.
void MacroAssembler::empty_FPU_stack() {
  if (VM_Version::supports_mmx()) {
    emms();
  } else {
    for (int i = 8; i-- > 0; ) ffree(i);
  }
}
#endif // !LP64 || C1 || !C2
6714 
6715 
// Defines obj, preserves var_size_in_bytes
// Lock-free inline allocation from eden: read the current top, compute
// the new top, range-check it, then CAS the new top in; loop on CAS
// failure.  Falls through to slow_case when inline contiguous
// allocation is unavailable, on address wrap-around, or when eden is
// exhausted.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert(obj == rax, "obj must be in rax, for cmpxchg");
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    jmp(slow_case);
  } else {
    Register end = t1;
    Label retry;
    bind(retry);
    ExternalAddress heap_top((address) Universe::heap()->top_addr());
    movptr(obj, heap_top);
    // end := obj + size (constant size when var_size_in_bytes == noreg)
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
    }
    // if end < obj then we wrapped around => object too long => slow case
    cmpptr(end, obj);
    jcc(Assembler::below, slow_case);
    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
    jcc(Assembler::above, slow_case);
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
    // it otherwise. Use lock prefix for atomicity on MPs.
    locked_cmpxchgptr(end, heap_top);
    jcc(Assembler::notEqual, retry);
  }
}
6749 
// Standard frame prologue: save the caller's rbp and establish ours.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}

// Compare ST0 with ST1 and pop both (the common x87 compare case).
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}
6758 
// Compare ST0 against ST(index), optionally popping one or both
// operands.  On cmov-capable CPUs fucomi(p) sets eflags directly and
// no temp register is needed; otherwise the FPU status word is routed
// through rax (fnstsw/sahf), so a temp is required to preserve rax.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
6794 
// Materialize an x87 compare result as -1/0/+1 in dst, popping both operands.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}

// Convert the eflags produced by fcmp into an integer in dst:
// -1 for less, 0 for equal, +1 for greater.  The unordered (NaN)
// case maps to -1 or +1 per unordered_is_less.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered => -1
    jcc(Assembler::below , L);   // less      => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     => 0
    increment(dst);              // greater   => 1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered => 1
    jcc(Assembler::above , L);   // greater   => 1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal     => 0
    decrementl(dst);             // less      => -1
  }
  bind(L);
}
6819 
// x87 loads from literal addresses.  Note these use as_Address
// unconditionally (no lea fallback), so the literal must be reachable.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}

void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}

void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}

// Load the x87 control word from a literal address.
void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}

// Discard ST0: free the slot, then advance the top-of-stack pointer.
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}
6840 
// IEEE remainder of ST0 by ST1 via fprem.  fprem may produce only a
// partial remainder, so loop until the status word's "reduction
// incomplete" flag clears.  tmp preserves rax around the status reads.
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
#ifdef _LP64
    testl(rax, 0x400);           // C2 bit: reduction incomplete
    jcc(Assembler::notEqual, L);
#else
    sahf();                      // C2 lands in PF after sahf
    jcc(Assembler::parity, L);
#endif // _LP64
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
6862 
6863 
// Increment the 32-bit value at a literal address, materializing the
// address in rscratch1 when it is not directly reachable.
void MacroAssembler::incrementl(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementl(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementl(Address(rscratch1, 0));
  }
}

void MacroAssembler::incrementl(ArrayAddress dst) {
  incrementl(as_Address(dst));
}

// Add 'value' to reg with the smallest encoding: nothing for 0, inc
// for 1 (when UseIncDec), decrementl for negative values.  min_jint
// must add directly since its negation overflows int.
void MacroAssembler::incrementl(Register reg, int value) {
  if (value == min_jint) {addl(reg, value) ; return; }
  if (value <  0) { decrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(reg) ; return; }
  /* else */      { addl(reg, value)       ; return; }
}

// Memory-operand form of the increment above; same encoding selection.
void MacroAssembler::incrementl(Address dst, int value) {
  if (value == min_jint) {addl(dst, value) ; return; }
  if (value <  0) { decrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(dst) ; return; }
  /* else */      { addl(dst, value)       ; return; }
}
6892 
// Unconditional jump to a literal destination: direct jmp when the
// target is reachable, otherwise an indirect jump through rscratch1.
void MacroAssembler::jump(AddressLiteral dst) {
  if (reachable(dst)) {
    jmp_literal(dst.target(), dst.rspec());
  } else {
    lea(rscratch1, dst);
    jmp(rscratch1);
  }
}
6901 
// Conditional jump to a literal destination.  When reachable, emits a
// raw jcc (short 2-byte form if the displacement fits in 8 bits and no
// relocation is needed, else the 6-byte near form).  When unreachable,
// there is no indirect jcc on x86, so the condition is reversed to
// branch around an indirect jmp through rscratch1.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)_code_pos);
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
6930 
// Load the MXCSR (SSE control/status) register from a literal address,
// with the usual lea fallback when the address is not reachable.
void MacroAssembler::ldmxcsr(AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ldmxcsr(as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ldmxcsr(Address(rscratch1, 0));
  }
}
6939 
// Load and sign-extend a byte from src into dst.  Returns the code
// offset of the load itself (for implicit null-check bookkeeping).
// Pre-P6 32-bit CPUs avoid movsx by shifting instead.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}

// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}
6972 
// Load and zero-extend a byte from src into dst; returns the code
// offset of the load.  The xor+movb path for pre-P6 CPUs requires that
// dst not appear in the address (xor would clobber it first).
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}

// Note: load_unsigned_short used to be called load_unsigned_word.
// Load and zero-extend a 16-bit value; same structure as the byte case.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
7003 
// Load a value of 1/2/4/8 bytes from src into dst, sign- or
// zero-extending sub-word sizes.  On 32-bit, an 8-byte load needs a
// second destination register (dst2) for the high half.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}

// Store a value of 1/2/4/8 bytes from src to dst; on 32-bit an 8-byte
// store takes the high half from src2.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
7039 
// Store a 32-bit register to a literal address (lea fallback when the
// address is not directly reachable).
void MacroAssembler::mov32(AddressLiteral dst, Register src) {
  if (reachable(dst)) {
    movl(as_Address(dst), src);
  } else {
    lea(rscratch1, dst);
    movl(Address(rscratch1, 0), src);
  }
}

// Load a 32-bit value from a literal address into a register.
void MacroAssembler::mov32(Register dst, AddressLiteral src) {
  if (reachable(src)) {
    movl(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movl(dst, Address(rscratch1, 0));
  }
}
7057 
7058 // C++ bool manipulation
7059 
7060 void MacroAssembler::movbool(Register dst, Address src) {
7061   if(sizeof(bool) == 1)
7062     movb(dst, src);
7063   else if(sizeof(bool) == 2)
7064     movw(dst, src);
7065   else if(sizeof(bool) == 4)
7066     movl(dst, src);
7067   else
7068     // unsupported
7069     ShouldNotReachHere();
7070 }
7071 
7072 void MacroAssembler::movbool(Address dst, bool boolconst) {
7073   if(sizeof(bool) == 1)
7074     movb(dst, (int) boolconst);
7075   else if(sizeof(bool) == 2)
7076     movw(dst, (int) boolconst);
7077   else if(sizeof(bool) == 4)
7078     movl(dst, (int) boolconst);
7079   else
7080     // unsupported
7081     ShouldNotReachHere();
7082 }
7083 
7084 void MacroAssembler::movbool(Address dst, Register src) {
7085   if(sizeof(bool) == 1)
7086     movb(dst, src);
7087   else if(sizeof(bool) == 2)
7088     movw(dst, src);
7089   else if(sizeof(bool) == 4)
7090     movl(dst, src);
7091   else
7092     // unsupported
7093     ShouldNotReachHere();
7094 }
7095 
// Store an immediate byte to an array element address.
void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}
7099 
// Load a double from a literal address.  movsd clears the upper half of
// the XMM register; movlpd leaves it alone — chosen by the
// UseXmmLoadAndClearUpper flag for the faster variant on the CPU.
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, as_Address(src));
    } else {
      movlpd(dst, as_Address(src));
    }
  } else {
    lea(rscratch1, src);
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, Address(rscratch1, 0));
    } else {
      movlpd(dst, Address(rscratch1, 0));
    }
  }
}

// Load a float from a literal address (lea fallback when unreachable).
void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    movss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movss(dst, Address(rscratch1, 0));
  }
}
7125 
// Pointer-width moves: movq on 64-bit, movl on 32-bit.
void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Register dst, intptr_t src) {
  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
}

void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
7142 
// SSE scalar move/multiply from a literal address: direct operand when
// reachable, otherwise through rscratch1.
void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::movsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::movsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::movss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::movss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::mulsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::mulsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::mulss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::mulss(dst, Address(rscratch1, 0));
  }
}
7178 
// Explicit null check: when the access offset is too large for the OS
// protected page to catch the fault implicitly, touch M[reg] now so a
// null reg faults here; otherwise rely on the later access to fault.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}
7193 
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}

// Restore everything push_CPU_state saved, in reverse order.
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}

// Restore FPU/SSE state saved by push_FPU_state and release its stack area.
void MacroAssembler::pop_FPU_state() {
  NOT_LP64(frstor(Address(rsp, 0));)
  LP64_ONLY(fxrstor(Address(rsp, 0));)
  addptr(rsp, FPUStateSizeInWords * wordSize);
}

// Restore integer registers and flags saved by push_IU_state.
void MacroAssembler::pop_IU_state() {
  popa();
  LP64_ONLY(addq(rsp, 8)); // drop the alignment slot pushed on 64-bit
  popf();
}
7216 
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}

// Save FPU/SSE state into a freshly reserved stack area:
// fnsave on 32-bit, fxsave on 64-bit.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // LP64
}

void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  LP64_ONLY(subq(rsp, 8));
  pusha();
}
7241 
// Clear the thread's last-Java-frame anchor.  Clearing last_Java_sp
// marks the anchor as empty; fp and pc are cleared only on request.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc)
    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);

}
7258 
// Restore rax from tmp (or from the stack when tmp == noreg);
// counterpart of save_rax below.
void MacroAssembler::restore_rax(Register tmp) {
  if (tmp == noreg) pop(rax);
  else if (tmp != rax) mov(rax, tmp);
}

// Round reg up to the next multiple of 'modulus' (a power of two).
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}

// Save rax into tmp, or push it when no temp register is available.
void MacroAssembler::save_rax(Register tmp) {
  if (tmp == noreg) push(rax);
  else if (tmp != rax) mov(tmp, rax);
}
7273 
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  // Derive a per-thread, int-aligned offset within the serialization page.
  movl(tmp, thread);
  shrl(tmp, os::get_serialize_page_shift_count());
  andl(tmp, (os::vm_page_size() - sizeof(int)));

  Address index(noreg, tmp, Address::times_1);
  ExternalAddress page(os::get_memory_serialize_page());

  // Size of store must match masking code above
  movl(as_Address(ArrayAddress(page, index)), tmp);
}
7289 
// Calls to C land
//
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  // sp is stored last: it is what marks the anchor as set.
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
7325 
// Pointer-width shifts: 64-bit forms on LP64, 32-bit otherwise.
void MacroAssembler::shlptr(Register dst, int imm8) {
  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
}

void MacroAssembler::shrptr(Register dst, int imm8) {
  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
}
7333 
// Sign-extend the low byte of reg in place.  movsx needs a
// byte-addressable register; otherwise shift left/right arithmetically.
void MacroAssembler::sign_extend_byte(Register reg) {
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
    movsbl(reg, reg); // movsxb
  } else {
    shll(reg, 24);
    sarl(reg, 24);
  }
}

// Sign-extend the low 16 bits of reg in place.
void MacroAssembler::sign_extend_short(Register reg) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    movswl(reg, reg); // movsxw
  } else {
    shll(reg, 16);
    sarl(reg, 16);
  }
}
7351 
// Test dst against the value at a literal address.  Unlike the other
// AddressLiteral wrappers here this has no lea fallback and simply
// asserts reachability — presumably all current callers pass reachable
// addresses (NOTE(review): confirm before adding far-address callers).
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
7356 
// The wrappers below apply an SSE scalar/packed instruction to an
// AddressLiteral operand: the literal is used directly when reachable,
// otherwise its address is materialized in rscratch1 first.
void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::sqrtsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::sqrtsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::sqrtss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::sqrtss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::subsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::subsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::subss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::subss(dst, Address(rscratch1, 0));
  }
}

// Unordered compares: like comisd/comiss but do not fault on QNaN.
void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ucomisd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ucomisd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ucomiss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ucomiss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::xorpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::xorps(dst, Address(rscratch1, 0));
  }
}
7432 
// AVX 3-operands instructions
// Same AddressLiteral dispatch pattern as the SSE wrappers above, for
// the VEX-encoded three-operand forms (dst = nds OP src).
void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vandpd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vandpd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vandps(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vandps(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vmulsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vmulsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vmulss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vmulss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vxorpd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vxorpd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vxorps(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vxorps(dst, nds, Address(rscratch1, 0));
  }
}
7542 
7543 
7544 //////////////////////////////////////////////////////////////////////////////////
7545 #ifndef SERIALGC
7546 
// G1 SATB pre-barrier.  When concurrent marking is active, records the
// previous value of a reference field in the thread-local SATB mark queue.
// If obj != noreg, the previous value is loaded from Address(obj, 0) into
// pre_val; otherwise the caller must have placed it in pre_val already.
// NULL previous values are not recorded.  When the thread's queue is full
// (index == 0) the slow path calls SharedRuntime::g1_wb_pre, saving and
// restoring the live input registers (and rax if tosca_live) around it.
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg) {
    assert_different_registers(obj, pre_val, tmp);
    assert(pre_val != rax, "check this code");
  }

  // Thread-local SATB queue fields: active flag, current index, buffer base.
  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active?  The active flag may be 4 or 1 bytes wide.
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?  Nothing to record in that case.
  cmpptr(pre_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  movptr(tmp, index);                   // tmp := *index_adr
  cmpptr(tmp, 0);                       // tmp == 0?
  jcc(Assembler::equal, runtime);       // If yes, goto runtime

  // The index counts down; the next free slot is buffer + (index - wordSize).
  subptr(tmp, wordSize);                // tmp := tmp - wordSize
  movptr(index, tmp);                   // *index_adr := tmp
  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr

  // Record the previous value
  movptr(Address(tmp, 0), pre_val);
  jmp(done);

  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);

  if (obj != noreg && obj != rax)
    push(obj);

  if (pre_val != rax)
    push(pre_val);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we care generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have have a full interpreter frame on the stack
  // expand_call should be passed true.

  NOT_LP64( push(thread); )

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  NOT_LP64( pop(thread); )

  // restore the live input values (pushed above in reverse order)
  if (pre_val != rax)
    pop(pre_val);

  if (obj != noreg && obj != rax)
    pop(obj);

  if(tosca_live) pop(rax);

  bind(done);
}
7660 
// G1 post-barrier.  After new_val has been stored at store_addr, dirties
// the card covering store_addr and enqueues the card in the thread-local
// dirty card queue — but only if the store crosses heap regions, new_val
// is non-NULL, and the card is not already dirty (card byte == 0).  When
// the queue is full (queue_index == 0) the slow path calls
// SharedRuntime::g1_wb_post.  Clobbers tmp and tmp2 (and rscratch1 on LP64).
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  // Thread-local dirty card queue fields: current index and buffer base.
  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  Label done;
  Label runtime;

  // Does store cross heap regions?  If the xor of the two addresses has no
  // bits above the region-size shift, both lie in the same region: done.

  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?

  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?

  ExternalAddress cardtable((address) ct->byte_map_base);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
#ifdef _LP64
  const Register card_addr = tmp;

  // card address = byte_map_base + (store_addr >> card_shift)
  movq(card_addr, store_addr);
  shrq(card_addr, CardTableModRefBS::card_shift);

  lea(tmp2, cardtable);

  // get the address of the card
  addq(card_addr, tmp2);
#else
  const Register card_index = tmp;

  movl(card_index, store_addr);
  shrl(card_index, CardTableModRefBS::card_shift);

  Address index(noreg, card_index, Address::times_1);
  const Register card_addr = tmp;
  lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
#endif
  cmpb(Address(card_addr, 0), 0);
  jcc(Assembler::equal, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  movb(Address(card_addr, 0), 0);

  // Try to enqueue the card address into the thread-local buffer.
  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  movl(Address(tmp2, 0), card_index);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);
}
7754 
7755 #endif // SERIALGC
7756 //////////////////////////////////////////////////////////////////////////////////
7757 
7758 
// Card-table store check, split into two parts so that other instructions
// can be scheduled in between by callers that use the parts directly.
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  store_check_part_1(obj);
  store_check_part_2(obj);
}
7765 
// Convenience overload: dst is unused here — the card is derived solely
// from the address in obj.
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}
7769 
7770 
7771 // split the store check operation so that other instructions can be scheduled inbetween
7772 void MacroAssembler::store_check_part_1(Register obj) {
7773   BarrierSet* bs = Universe::heap()->barrier_set();
7774   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
7775   shrptr(obj, CardTableModRefBS::card_shift);
7776 }
7777 
// Second half of the split store check: obj already holds the card index
// (address >> card_shift); store 0 into the corresponding card byte to
// mark it dirty.  Chooses between a plain displacement form and an
// ExternalAddress form depending on whether byte_map_base fits in 32 bits.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and
  // it will never need to be relocated. On 64bit however the value may be too
  // large for a 32bit displacement

  intptr_t disp = (intptr_t) ct->byte_map_base;
  if (is_simm32(disp)) {
    // Fits in a 32-bit displacement: single movb with [obj + disp].
    Address cardtable(noreg, obj, Address::times_1, disp);
    movb(cardtable, 0);
  } else {
    // By doing it as an ExternalAddress disp could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and
    // a smarter version of as_Address. Worst case it is two instructions which
    // is no worse off then loading disp into a register and doing as a simple
    // Address() as above.
    // We can't do as ExternalAddress as the only style since if disp == 0 we'll
    // assert since NULL isn't acceptable in a reloci (see 6644928). In any case
    // in some cases we'll get a single instruction version.

    ExternalAddress cardtable((address)disp);
    Address index(noreg, obj, Address::times_1);
    movb(as_Address(ArrayAddress(cardtable, index)), 0);
  }
}
7809 
// Pointer-sized subtract of an immediate: subq on 64-bit, subl on 32-bit.
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}
7813 
// Pointer-sized register subtract: subq on 64-bit, subl on 32-bit.
void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}
7817 
7818 // C++ bool manipulation
7819 void MacroAssembler::testbool(Register dst) {
7820   if(sizeof(bool) == 1)
7821     testb(dst, 0xff);
7822   else if(sizeof(bool) == 2) {
7823     // testw implementation needed for two byte bools
7824     ShouldNotReachHere();
7825   } else if(sizeof(bool) == 4)
7826     testl(dst, dst);
7827   else
7828     // unsupported
7829     ShouldNotReachHere();
7830 }
7831 
// Pointer-sized register test: testq on 64-bit, testl on 32-bit.
void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}
7835 
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Bump-pointer allocation in the current thread's TLAB: obj := old top,
// new top := obj + size, jumping to slow_case if the new top would exceed
// tlab_end.  The size is var_size_in_bytes (in bytes) when that register is
// valid, otherwise the constant con_size_in_bytes.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  assert_different_registers(obj, t1, t2);
  assert_different_registers(obj, var_size_in_bytes, t1);
  Register end = t2;
  // On 64-bit the thread is always in r15; on 32-bit it is loaded into t1.
  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);

  verify_tlab();

  NOT_LP64(get_thread(thread));

  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
  if (var_size_in_bytes == noreg) {
    lea(end, Address(obj, con_size_in_bytes));
  } else {
    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
  }
  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
  jcc(Assembler::above, slow_case);

  // update the tlab top pointer
  movptr(Address(thread, JavaThread::tlab_top_offset()), end);

  // recover var_size_in_bytes if necessary (caller passed t2 == size reg)
  if (var_size_in_bytes == end) {
    subptr(var_size_in_bytes, obj);
  }
  verify_tlab();
}
7870 
// Preserves rbx, and rdx.
// Refills the current thread's TLAB from eden.  Decides whether to discard
// the old TLAB (filling its remainder with a dummy int[] so the heap stays
// parsable) or to keep it and allocate this object directly in eden
// (jumping to try_eden).  On success jumps back to retry; allocation
// failure goes to slow_case.  Returns the register holding the thread
// pointer for use by the caller.  Uses rax, rcx, rsi (and rdi on 32-bit).
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = rax;
  Register t1  = rcx;
  Register t2  = rsi;
  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    jmp(slow_case);
  }

  NOT_LP64(get_thread(thread_reg));

  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space (in heap words)
  subptr(t1, top);
  shrptr(t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.
  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  jcc(Assembler::lessEqual, discard_tlab);

  // Retain
  // %%% yuck as movptr...
  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
  if (TLABStats) {
    // increment number of slow_allocations
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
  }
  jmp(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
    // accumulate wastage -- t1 is amount free in tlab
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  testptr(top, top);
  jcc(Assembler::zero, do_refill);

  // set up the mark word
  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  // set the length to the remaining space
  subptr(t1, typeArrayOopDesc::header_size(T_INT));
  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
  // set klass to intArrayKlass
  // dubious reloc why not an oop reloc?
  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // account the dummy filler array in the thread's allocated-bytes counter
  movptr(t1, top);
  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
  incr_allocated_bytes(thread_reg, t1, 0);

  // refill the tlab with an eden allocation
  bind(do_refill);
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
  shlptr(t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = rsi;
    assert_different_registers(tsize, thread_reg, t1);
    push(tsize);
    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
    shlptr(tsize, LogHeapWordSize);
    cmpptr(t1, tsize);
    jcc(Assembler::equal, ok);
    stop("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    pop(tsize);
  }
#endif
  // install the new TLAB: start = top, end = start + size - alignment_reserve
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
  addptr(top, t1);
  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
  verify_tlab();
  jmp(retry);

  return thread_reg; // for use by caller
}
7977 
// Adds an allocation to the thread's cumulative allocated_bytes counter.
// The amount is var_size_in_bytes when that register is valid, otherwise
// the constant con_size_in_bytes.  On 32-bit the counter is wider than a
// register, so the carry is propagated into the high word with adcl; t1
// is only needed on 32-bit, as a temp when thread is not a valid register.
void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
#ifdef _LP64
  if (var_size_in_bytes->is_valid()) {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
#else
  if (!thread->is_valid()) {
    assert(t1->is_valid(), "need temp reg");
    thread = t1;
    get_thread(thread);
  }

  if (var_size_in_bytes->is_valid()) {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
  // propagate the carry into the high 32 bits of the 64-bit counter
  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
8003 
// pi/4: trigfunc() uses this to test whether |x| is small enough for the
// x87 fsin/fcos/ftan fast path.
static const double     pi_4 =  0.7853981633974483;
8005 
// Computes sin ('s'), cos ('c') or tan ('t') of the value on the x87 stack
// top, leaving the result in F-TOS.  The fast path uses fsin/fcos/ftan
// directly when |x| <= pi/4; otherwise it falls back to a runtime call
// (SharedRuntime::dsin/dcos/dtan), preserving the caller's registers and
// the num_fpu_regs_in_use other live FPU stack slots around the call.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    fcmp(tmp);
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);
  // Preserve registers across runtime call
  pusha();
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin and dcos into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin or dcos.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    fld_d(Address(rsp, incoming_argument_and_return_value_offset));
  }
  // Pass the argument on the stack; on LP64 also copy it into xmm0 for the ABI.
  subptr(rsp, sizeof(jdouble));
  fstp_d(Address(rsp, 0));
#ifdef _LP64
  movdbl(xmm0, Address(rsp, 0));
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  NEEDS_CLEANUP;
  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level
  switch(trig) {
  case 's':
    {
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
    }
    break;
  case 'c':
    {
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
    }
    break;
  case 't':
    {
      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }
#ifdef _LP64
    // On LP64 the result comes back in xmm0; move it onto the x87 stack.
    movsd(Address(rsp, 0), xmm0);
    fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble));
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU stack
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
  }
  popa();

  // Come here with result in F-TOS
  bind(done);

  if (tmp != noreg) {
    pop(tmp);
  }
}
8126 
8127 
8128 // Look up the method for a megamorphic invokeinterface call.
8129 // The target method is determined by <intf_klass, itable_index>.
8130 // The receiver klass is in recv_klass.
8131 // On success, the result will be in method_result, and execution falls through.
8132 // On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = instanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, instanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // scan_temp := first itableOffsetEntry (klass + vtable_base + len*vte_size)
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once (peel == 1 is the first iteration) so that the
  // common hit-on-first-entry case falls straight through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}
8201 
8202 
// Full subtype check: jumps to L_success if sub_klass is a subtype of
// super_klass, falls through otherwise.  Combines the fast path (supertype
// display / super-cache probe) with the slow path (secondary supers scan).
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
8212 
8213 
// Fast path of the subtype check: compares super_klass against the entry at
// sub_klass + super_check_offset (the supertype display, or, aliased, the
// secondary super cache).  Branches to L_success, L_failure or L_slow_path;
// at most one of the labels may be NULL, meaning fall through.  If
// super_check_offset is not supplied (constant -1) it is loaded from
// super_klass into temp_reg.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
                   Klass::secondary_super_cache_offset_in_bytes());
  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
                    Klass::super_check_offset_offset_in_bytes());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
8317 
8318 
8319 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
8320                                                    Register super_klass,
8321                                                    Register temp_reg,
8322                                                    Register temp2_reg,
8323                                                    Label* L_success,
8324                                                    Label* L_failure,
8325                                                    bool set_cond_codes) {
8326   assert_different_registers(sub_klass, super_klass, temp_reg);
8327   if (temp2_reg != noreg)
8328     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
8329 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
8330 
8331   Label L_fallthrough;
8332   int label_nulls = 0;
8333   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
8334   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
8335   assert(label_nulls <= 1, "at most one NULL in the batch");
8336 
8337   // a couple of useful fields in sub_klass:
8338   int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
8339                    Klass::secondary_supers_offset_in_bytes());
8340   int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
8341                    Klass::secondary_super_cache_offset_in_bytes());
8342   Address secondary_supers_addr(sub_klass, ss_offset);
8343   Address super_cache_addr(     sub_klass, sc_offset);
8344 
8345   // Do a linear scan of the secondary super-klass chain.
8346   // This code is rarely used, so simplicity is a virtue here.
8347   // The repne_scan instruction uses fixed registers, which we must spill.
8348   // Don't worry too much about pre-existing connections with the input regs.
8349 
8350   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
8351   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
8352 
8353   // Get super_klass value into rax (even if it was in rdi or rcx).
8354   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
8355   if (super_klass != rax || UseCompressedOops) {
8356     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
8357     mov(rax, super_klass);
8358   }
8359   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
8360   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
8361 
8362 #ifndef PRODUCT
8363   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
8364   ExternalAddress pst_counter_addr((address) pst_counter);
8365   NOT_LP64(  incrementl(pst_counter_addr) );
8366   LP64_ONLY( lea(rcx, pst_counter_addr) );
8367   LP64_ONLY( incrementl(Address(rcx, 0)) );
8368 #endif //PRODUCT
8369 
8370   // We will consult the secondary-super array.
8371   movptr(rdi, secondary_supers_addr);
8372   // Load the array length.  (Positive movl does right thing on LP64.)
8373   movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
8374   // Skip to start of data.
8375   addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
8376 
8377   // Scan RCX words at [RDI] for an occurrence of RAX.
8378   // Set NZ/Z based on last compare.
8379   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
8380   // not change flags (only scas instruction which is repeated sets flags).
8381   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
8382 #ifdef _LP64
8383   // This part is tricky, as values in supers array could be 32 or 64 bit wide
8384   // and we store values in objArrays always encoded, thus we need to encode
8385   // the value of rax before repne.  Note that rax is dead after the repne.
8386   if (UseCompressedOops) {
8387     encode_heap_oop_not_null(rax); // Changes flags.
8388     // The superclass is never null; it would be a basic system error if a null
8389     // pointer were to sneak in here.  Note that we have already loaded the
8390     // Klass::super_check_offset from the super_klass in the fast path,
8391     // so if there is a null in that register, we are already in the afterlife.
8392     testl(rax,rax); // Set Z = 0
8393     repne_scanl();
8394   } else
8395 #endif // _LP64
8396   {
8397     testptr(rax,rax); // Set Z = 0
8398     repne_scan();
8399   }
8400   // Unspill the temp. registers:
8401   if (pushed_rdi)  pop(rdi);
8402   if (pushed_rcx)  pop(rcx);
8403   if (pushed_rax)  pop(rax);
8404 
8405   if (set_cond_codes) {
8406     // Special hack for the AD files:  rdi is guaranteed non-zero.
8407     assert(!pushed_rdi, "rdi must be left non-NULL");
8408     // Also, the condition codes are properly set Z/NZ on succeed/failure.
8409   }
8410 
8411   if (L_failure == &L_fallthrough)
8412         jccb(Assembler::notEqual, *L_failure);
8413   else  jcc(Assembler::notEqual, *L_failure);
8414 
8415   // Success.  Cache the super we found and proceed in triumph.
8416   movptr(super_cache_addr, super_klass);
8417 
8418   if (L_success != &L_fallthrough) {
8419     jmp(*L_success);
8420   }
8421 
8422 #undef IS_A_TEMP
8423 
8424   bind(L_fallthrough);
8425 }
8426 
8427 
8428 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
8429   if (VM_Version::supports_cmov()) {
8430     cmovl(cc, dst, src);
8431   } else {
8432     Label L;
8433     jccb(negate_condition(cc), L);
8434     movl(dst, src);
8435     bind(L);
8436   }
8437 }
8438 
8439 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
8440   if (VM_Version::supports_cmov()) {
8441     cmovl(cc, dst, src);
8442   } else {
8443     Label L;
8444     jccb(negate_condition(cc), L);
8445     movl(dst, src);
8446     bind(L);
8447   }
8448 }
8449 
void MacroAssembler::verify_oop(Register reg, const char* s) {
  // Emit code that verifies the oop held in 'reg', tagging any failure
  // report with message 's'.  Emits nothing unless -XX:+VerifyOops.
  // Contract: the generated code preserves all registers and flags
  // (the subroutine and the caller-side pops restore everything).
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  // NOTE: 'b' is deliberately never freed -- its address is embedded in the
  // generated code below and must outlive this assembler.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop: %s: %s", reg->name(), s);
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
}
8471 
8472 
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  // Produce (value-at-*delayed_value_addr + offset), where the value may not
  // be known yet at code-generation time.  If it is already non-zero, return
  // it as a compile-time constant; otherwise emit code that loads it at
  // execution time into 'tmp' and return 'tmp'.
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // Debug code: trap if the delayed value is still zero when executed.
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      jcc(Assembler::notZero, L);
      // Buffer deliberately leaked: its address is embedded in the stop()
      // code.  delayed_value_addr[1] presumably holds a diagnostic word
      // describing the slot -- TODO confirm against delayed_value() callers.
      char* buf = new char[40];
      sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
      stop(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  // Fold in the compile-time-constant offset, if any.
  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}
8504 
8505 
8506 // registers on entry:
8507 //  - rax ('check' register): required MethodType
8508 //  - rcx: method handle
8509 //  - rdx, rsi, or ?: killable temp
8510 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
8511                                               Register temp_reg,
8512                                               Label& wrong_method_type) {
8513   Address type_addr(mh_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg));
8514   // compare method type against that of the receiver
8515   if (UseCompressedOops) {
8516     load_heap_oop(temp_reg, type_addr);
8517     cmpptr(mtype_reg, temp_reg);
8518   } else {
8519     cmpptr(mtype_reg, type_addr);
8520   }
8521   jcc(Assembler::notEqual, wrong_method_type);
8522 }
8523 
8524 
// A method handle has a "vmslots" field which gives the size of its
// argument list in JVM stack slots.  This field is either located directly
// in every method handle, or else is indirectly accessed through the
// method handle's MethodType.  This macro hides the distinction.
void MacroAssembler::load_method_handle_vmslots(Register vmslots_reg, Register mh_reg,
                                                Register temp_reg) {
  assert_different_registers(vmslots_reg, mh_reg, temp_reg);
  // load mh.type.form.vmslots
  // Walks the chain mh -> type -> form, reusing vmslots_reg as the cursor;
  // temp_reg is only consumed by delayed_value() for offset computation.
  Register temp2_reg = vmslots_reg;
  load_heap_oop(temp2_reg, Address(mh_reg,    delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg)));
  load_heap_oop(temp2_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, temp_reg)));
  movl(vmslots_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodTypeForm::vmslots_offset_in_bytes, temp_reg)));
}
8538 
8539 
// registers on entry:
//  - rcx: method handle
//  - rdx: killable temp (interpreted only)
//  - rax: killable temp (compiled only)
void MacroAssembler::jump_to_method_handle_entry(Register mh_reg, Register temp_reg) {
  // Tail-jump into the method handle's interpreted entry point.
  // Does not return: control transfers to the MH adapter stub.
  assert(mh_reg == rcx, "caller must put MH object in rcx");
  assert_different_registers(mh_reg, temp_reg);

  // pick out the interpreted side of the handler
  // NOTE: vmentry is not an oop!
  movptr(temp_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::vmentry_offset_in_bytes, temp_reg)));

  // off we go...
  jmp(Address(temp_reg, MethodHandleEntry::from_interpreted_entry_offset_in_bytes()));

  // for the various stubs which take control at this point,
  // see MethodHandles::generate_method_handle_stub
}
8558 
8559 
8560 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
8561                                          int extra_slot_offset) {
8562   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
8563   int stackElementSize = Interpreter::stackElementSize;
8564   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
8565 #ifdef ASSERT
8566   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
8567   assert(offset1 - offset == stackElementSize, "correct arithmetic");
8568 #endif
8569   Register             scale_reg    = noreg;
8570   Address::ScaleFactor scale_factor = Address::no_scale;
8571   if (arg_slot.is_constant()) {
8572     offset += arg_slot.as_constant() * stackElementSize;
8573   } else {
8574     scale_reg    = arg_slot.as_register();
8575     scale_factor = Address::times(stackElementSize);
8576   }
8577   offset += wordSize;           // return PC is on stack
8578   return Address(rsp, scale_reg, scale_factor, offset);
8579 }
8580 
8581 
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  // Emit code that verifies the oop stored at 'addr', tagging any failure
  // with message 's'.  Emits nothing unless -XX:+VerifyOops.
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  // NOTE: 'b' is deliberately never freed -- its address is embedded in the
  // generated code and must outlive this assembler.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop_addr: %s", s);

#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    // Materialize the (now stale) address and re-add the bytes the pushes
    // above moved rsp by: one word on 32-bit, two on 64-bit.
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
8617 
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  // Debug-only: emit code checking the TLAB invariant start <= top <= end
  // for the current thread.  Halts with a message on violation.
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    // On 64-bit the thread lives permanently in r15; on 32-bit we must
    // fetch it into a scratch register.
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    // Check tlab_top >= tlab_start.
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    stop("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    // Check tlab_end >= tlab_top.
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    stop("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
8648 
// Debug-printing view of the x87 FPU control word (only the low 16 bits
// are significant).  Layout follows the IA-32 architecture: rounding
// control in bits 10-11, precision control in bits 8-9, exception masks
// in bits 0-5.
class ControlWord {
 public:
  int32_t _value;

  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
  int  precision_control() const       { return  (_value >>  8) & 3      ; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  // Print the control word as hex plus decoded mask letters (upper case =
  // exception masked), rounding mode and precision mode.
  void print() const {
    // Table lookups replace the previous switch statements, which left
    // rc/pc formally uninitialized (the 2-bit fields make the switches
    // exhaustive, but the compiler cannot see that and warns).
    static const char* const rc_name[] =
      { "round near", "round down", "round up  ", "chop      " };
    static const char* const pc_name[] =
      { "24 bits ", "reserved", "53 bits ", "64 bits " };
    const char* rc = rc_name[rounding_control()];
    const char* pc = pc_name[precision_control()];
    // flags: one letter per exception mask, upper case when set
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision   ()) ? 'P' : 'p';
    f[3] = (underflow   ()) ? 'U' : 'u';
    f[4] = (overflow    ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid     ()) ? 'I' : 'i';
    f[8] = '\0';
    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

};
8695 
// Debug-printing view of the x87 FPU status word (low 16 bits significant):
// condition codes C0-C3, top-of-stack pointer in bits 11-13, busy flag,
// and the exception/status bits in the low byte.
class StatusWord {
 public:
  int32_t _value;

  // Test one status-word bit.
  bool bit(int i) const                { return ((_value >> i) & 1) != 0; }

  bool busy() const                    { return bit(15); }
  bool C3() const                      { return bit(14); }
  bool C2() const                      { return bit(10); }
  bool C1() const                      { return bit( 9); }
  bool C0() const                      { return bit( 8); }
  int  top() const                     { return (_value >> 11) & 7; }
  bool error_status() const            { return bit( 7); }
  bool stack_fault() const             { return bit( 6); }
  bool precision() const               { return bit( 5); }
  bool underflow() const               { return bit( 4); }
  bool overflow() const                { return bit( 3); }
  bool zero_divide() const             { return bit( 2); }
  bool denormalized() const            { return bit( 1); }
  bool invalid() const                 { return bit( 0); }

  // Print hex value, exception flags (letter = set, '-' = clear),
  // condition codes and the top-of-stack index.
  void print() const {
    // condition codes, C3 first
    const char cc_digit[4] = { '3', '2', '1', '0' };
    const bool cc_set[4]   = { C3(), C2(), C1(), C0() };
    char c[5];
    for (int i = 0; i < 4; i++) c[i] = cc_set[i] ? cc_digit[i] : '-';
    c[4] = '\0';
    // exception/status flags, most significant first
    const char flag_letter[8] = { 'E', 'S', 'P', 'U', 'O', 'Z', 'D', 'I' };
    const bool flag_set[8]    = { error_status(), stack_fault(), precision(),
                                  underflow(), overflow(), zero_divide(),
                                  denormalized(), invalid() };
    char f[9];
    for (int i = 0; i < 8; i++) f[i] = flag_set[i] ? flag_letter[i] : '-';
    f[8] = '\0';
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
  }

};
8739 
// Debug-printing view of the x87 FPU tag word: two tag bits per physical
// register (0 = valid, 1 = zero, 2 = special, 3 = empty).
class TagWord {
 public:
  int32_t _value;

  // Tag for physical register i (0..7).
  int tag_at(int i) const              { return (_value >> (2 * i)) & 0x3; }

  // Print the low 16 bits as hex.
  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};
8751 
// One 80-bit x87 register as laid out in an fsave image: 64-bit mantissa
// in _m0 (low half) and _m1 (high half), sign + 15-bit exponent in _ex.
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True iff this is the x87 "indefinite" QNaN (sign = 1, exponent all
  // ones, mantissa = 1100...0) produced for masked invalid operations.
  bool is_indefinite() const           {
    if (_m0 != 0)                       return false;
    if (_m1 != (int32_t)0xC0000000)     return false;
    return _ex == -1;
  }

  // Print as <sign><exp-hex>.<mantissa-hex>, tagging NaN encodings.
  void print() const {
    const char  sign_ch = (_ex < 0) ? '-' : '+';
    const bool  is_nan  = (_ex == 0x7FFF || _ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x  %s", sign_ch, _ex, _m1, _m0, is_nan ? "NaN" : "   ");
  };

};
8769 
// Complete x87 FPU state as laid out by an fsave/fnsave-style dump pushed
// via push_CPU_state().  Field order and sizes must exactly mirror that
// memory image -- do not reorder or change the members.
class FPU_State {
 public:
  enum {
    register_size       = 10,     // 80 bits per x87 register
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  // Tag of the register i slots below the current top of the x87 stack
  // (i.e. of ST(i)), found by rotating the physical index by 'top'.
  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  // Raw 80-bit physical register i, viewed as an FPU_Register.
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  // Human-readable name for a 2-bit tag value.
  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  // Dump all eight registers (marking the current top with '*') followed
  // by the control, status and tag words.
  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};
8819 
// Debug-printing view of a saved EFLAGS image (as pushed by
// push_CPU_state).
class Flag_Register {
 public:
  int32_t _value;

  // Test one EFLAGS bit.
  bool bit(int i) const                { return ((_value >> i) & 1) != 0; }

  bool overflow() const                { return bit(11); }
  bool direction() const               { return bit(10); }
  bool sign() const                    { return bit( 7); }
  bool zero() const                    { return bit( 6); }
  bool auxiliary_carry() const         { return bit( 4); }
  bool parity() const                  { return bit( 2); }
  bool carry() const                   { return bit( 0); }

  // Print hex value plus one letter per flag ('-' when clear),
  // in O D S Z A P C order.
  void print() const {
    const char flag_letter[7] = { 'O', 'D', 'S', 'Z', 'A', 'P', 'C' };
    const bool flag_set[7]    = { overflow(), direction(), sign(), zero(),
                                  auxiliary_carry(), parity(), carry() };
    char f[8];
    for (int i = 0; i < 7; i++) f[i] = flag_set[i] ? flag_letter[i] : '-';
    f[7] = '\0';
    // output
    printf("%08x  flags = %s", _value, f);
  }

};
8848 
// One saved 32-bit integer register from a push_CPU_state image.
class IU_Register {
 public:
  int32_t _value;

  // Print as zero-padded hex and as signed decimal.
  void print() const {
    printf("%08x  %11d", _value, _value);
  }

};
8858 
8859 class IU_State {
8860  public:
8861   Flag_Register _eflags;
8862   IU_Register   _rdi;
8863   IU_Register   _rsi;
8864   IU_Register   _rbp;
8865   IU_Register   _rsp;
8866   IU_Register   _rbx;
8867   IU_Register   _rdx;
8868   IU_Register   _rcx;
8869   IU_Register   _rax;
8870 
8871   void print() const {
8872     // computation registers
8873     printf("rax,  = "); _rax.print(); printf("\n");
8874     printf("rbx,  = "); _rbx.print(); printf("\n");
8875     printf("rcx  = "); _rcx.print(); printf("\n");
8876     printf("rdx  = "); _rdx.print(); printf("\n");
8877     printf("rdi  = "); _rdi.print(); printf("\n");
8878     printf("rsi  = "); _rsi.print(); printf("\n");
8879     printf("rbp,  = "); _rbp.print(); printf("\n");
8880     printf("rsp  = "); _rsp.print(); printf("\n");
8881     printf("\n");
8882     // control registers
8883     printf("flgs = "); _eflags.print(); printf("\n");
8884   }
8885 };
8886 
8887 
8888 class CPU_State {
8889  public:
8890   FPU_State _fpu_state;
8891   IU_State  _iu_state;
8892 
8893   void print() const {
8894     printf("--------------------------------------------------\n");
8895     _iu_state .print();
8896     printf("\n");
8897     _fpu_state.print();
8898     printf("--------------------------------------------------\n");
8899   }
8900 
8901 };
8902 
8903 
// Runtime helper invoked from generated code (see print_CPU_state below);
// kept as a plain static function so its address can be embedded in code.
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
8907 
8908 
void MacroAssembler::print_CPU_state() {
  // Emit code that dumps the full CPU state (FPU + integer registers) via
  // _print_CPU_state, then restores everything it saved.
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
8916 
8917 
8918 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
8919   static int counter = 0;
8920   FPU_State* fs = &state->_fpu_state;
8921   counter++;
8922   // For leaf calls, only verify that the top few elements remain empty.
8923   // We only need 1 empty at the top for C2 code.
8924   if( stack_depth < 0 ) {
8925     if( fs->tag_for_st(7) != 3 ) {
8926       printf("FPR7 not empty\n");
8927       state->print();
8928       assert(false, "error");
8929       return false;
8930     }
8931     return true;                // All other stack states do not matter
8932   }
8933 
8934   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
8935          "bad FPU control word");
8936 
8937   // compute stack depth
8938   int i = 0;
8939   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
8940   int d = i;
8941   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
8942   // verify findings
8943   if (i != FPU_State::number_of_registers) {
8944     // stack not contiguous
8945     printf("%s: stack not contiguous at ST%d\n", s, i);
8946     state->print();
8947     assert(false, "error");
8948     return false;
8949   }
8950   // check if computed stack depth corresponds to expected stack depth
8951   if (stack_depth < 0) {
8952     // expected stack depth is -stack_depth or less
8953     if (d > -stack_depth) {
8954       // too many elements on the stack
8955       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
8956       state->print();
8957       assert(false, "error");
8958       return false;
8959     }
8960   } else {
8961     // expected stack depth is stack_depth
8962     if (d != stack_depth) {
8963       // wrong stack depth
8964       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
8965       state->print();
8966       assert(false, "error");
8967       return false;
8968     }
8969   }
8970   // everything is cool
8971   return true;
8972 }
8973 
8974 
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  // Emit code that checks the x87 stack against stack_depth by calling
  // _verify_FPU (see above for negative-depth semantics), breaking into
  // the debugger on failure.  Emits nothing unless -XX:+VerifyFPU.
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
8994 
void MacroAssembler::load_klass(Register dst, Register src) {
  // Load the klass pointer of the oop in src into dst; with compressed
  // oops the klass field is a narrow oop and must be decoded (never null,
  // so the not_null variant is safe).
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}
9004 
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  // Load the prototype mark word (used by biased locking) of src's klass
  // into dst.  On the compressed-oops path the narrow-klass decode is fused
  // into the final load's addressing mode instead of being done separately.
#ifdef _LP64
  if (UseCompressedOops) {
    assert (Universe::heap() != NULL, "java heap should be initialized");
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_oop_shift() != 0) {
      assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      if (LogMinObjAlignmentInBytes == Address::times_8) {
        // Decode (base + narrow*8) folded into the load's scaled addressing.
        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
      } else {
        // OK to use shift since we don't need to preserve flags.
        shlq(dst, LogMinObjAlignmentInBytes);
        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
      }
    } else {
      // Zero-based, unscaled: the narrow klass is already the address.
      movq(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
    }
  } else
#endif
  {
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    movptr(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
  }
}
9029 
void MacroAssembler::store_klass(Register dst, Register src) {
  // Store the klass pointer in src into the header of the oop in dst.
  // NOTE: with compressed oops this clobbers src (encoded in place).
#ifdef _LP64
  if (UseCompressedOops) {
    encode_heap_oop_not_null(src);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
9039 
void MacroAssembler::load_heap_oop(Register dst, Address src) {
  // Load a (possibly NULL) heap oop from src into dst, decoding it if
  // compressed oops are in use.
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop(dst);
  } else
#endif
    movptr(dst, src);
}
9049 
// Doesn't do verfication, generates fixed size code
// (the not-null decode has no branch, so the emitted byte count is
// constant -- callers may rely on that for code-size accounting).
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, src);
}
9060 
void MacroAssembler::store_heap_oop(Address dst, Register src) {
  // Store the (possibly NULL) oop in src to dst.
  // NOTE: with compressed oops this clobbers src (encoded in place), and
  // src must not participate in dst's addressing.
#ifdef _LP64
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else
#endif
    movptr(dst, src);
}
9071 
// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  // Clear an oop field without needing a scratch register.
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, (int32_t)NULL_WORD);
  } else {
    // Full-width store of zero via a sign-extended 32-bit immediate.
    movslq(dst, (int32_t)NULL_WORD);
  }
#else
  movl(dst, (int32_t)NULL_WORD);
#endif
}
9084 
9085 #ifdef _LP64
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  // With compressed oops the klass field occupies only 32 bits; fill the
  // adjacent 32-bit gap in dst's header with src (typically zero).
  // No-op when oops are uncompressed (no gap exists).
  if (UseCompressedOops) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
9092 
9093 #ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
  // Debug-only: emit code checking that r12 still holds the compressed-oop
  // heap base; stops with 'msg' if it has been corrupted.
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
    jcc(Assembler::equal, ok);
    stop(msg);
    bind(ok);
    pop(rscratch1);
  }
}
9107 #endif
9108 
// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register r) {
  // Compress the (possibly NULL) oop in r in place.
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  // Zero-base heap: encoding is just an (optional) right shift.
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Heap-based: a NULL oop must encode to 0, so substitute the heap base
  // for 0 before subtracting it (branchless via cmov).
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
9127 
void MacroAssembler::encode_heap_oop_not_null(Register r) {
  // Compress the oop in r in place; r is asserted non-NULL, so the NULL
  // preservation dance in encode_heap_oop can be skipped.
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
9148 
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
  // Two-register variant: compress the non-NULL oop in src into dst,
  // leaving src itself untouched when dst != src.
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (Universe::narrow_oop_base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
9172 
void  MacroAssembler::decode_heap_oop(Register r) {
  // Decode the (possibly NULL) narrow oop in r to a full oop in place.
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-base heap: decoding is just an (optional) left shift.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    // The shlq sets ZF when the narrow oop is zero; skip the base add so
    // NULL stays NULL.  (This relies on a non-zero shift count -- shl by 0
    // leaves flags unchanged; presumably base != NULL implies a non-zero
    // shift in every supported compressed-oops mode.  TODO confirm.)
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}
9191 
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Decode a known-non-NULL narrow oop in place: branchless shift + add.
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    // Unscaled mode must also be zero-based; nothing to do.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
9209 
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Two-register variant: decode the known-non-NULL narrow oop in src into
  // dst, leaving src untouched when dst != src.
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // Fold the whole decode (base + src*8) into a single lea.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    // Unscaled mode must also be zero-based; just copy if needed.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
9237 
9238 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
9239   assert (UseCompressedOops, "should only be used for compressed headers");
9240   assert (Universe::heap() != NULL, "java heap should be initialized");
9241   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
9242   int oop_index = oop_recorder()->find_index(obj);
9243   RelocationHolder rspec = oop_Relocation::spec(oop_index);
9244   mov_narrow_oop(dst, oop_index, rspec);
9245 }
9246 
9247 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
9248   assert (UseCompressedOops, "should only be used for compressed headers");
9249   assert (Universe::heap() != NULL, "java heap should be initialized");
9250   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
9251   int oop_index = oop_recorder()->find_index(obj);
9252   RelocationHolder rspec = oop_Relocation::spec(oop_index);
9253   mov_narrow_oop(dst, oop_index, rspec);
9254 }
9255 
9256 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
9257   assert (UseCompressedOops, "should only be used for compressed headers");
9258   assert (Universe::heap() != NULL, "java heap should be initialized");
9259   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
9260   int oop_index = oop_recorder()->find_index(obj);
9261   RelocationHolder rspec = oop_Relocation::spec(oop_index);
9262   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
9263 }
9264 
9265 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
9266   assert (UseCompressedOops, "should only be used for compressed headers");
9267   assert (Universe::heap() != NULL, "java heap should be initialized");
9268   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
9269   int oop_index = oop_recorder()->find_index(obj);
9270   RelocationHolder rspec = oop_Relocation::spec(oop_index);
9271   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
9272 }
9273 
9274 void MacroAssembler::reinit_heapbase() {
9275   if (UseCompressedOops) {
9276     movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
9277   }
9278 }
9279 #endif // _LP64
9280 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
// Emits an SSE4.2 pcmpestri-based substring search.  'result' returns the
// element index of the first occurrence of str2 (length int_cnt2, known at
// compile time) within str1, or -1 if not found.
void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                      Register cnt1, Register cnt2,
                                      int int_cnt2,  Register result,
                                      XMMRegister vec, Register tmp) {
  assert(UseSSE42Intrinsics, "SSE4.2 is required");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= 8, "this code isused only for cnt2 >= 8 chars");

  // Load substring.
  movdqu(vec, Address(str2, 0));
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > 8) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movdqu(vec, Address(str2, 0));
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);    // advance string pointer by one char (2 bytes)

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1: partial/full match found
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);                        // next 16-byte (8-char) vector
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == 8) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == 8) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(EXIT);

  if (int_cnt2 > 8) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), 0x0d);
    // Reload only string if does not match
    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, 8);   // cnt2 = -(remaining substring chars)

    bind(SCAN_SUBSTR);
    subl(cnt1, 8);
    cmpl(cnt2, -8); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, 8);
    movl(cnt2, 8); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // int_cnt2*2 fits in a displacement; address directly.
      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
    }
    // Need to reload strings pointers if not matched whole vector
    jccb(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, 8);
    jccb(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index
  bind(EXIT);

} // string_indexofC8
9431 
// Small strings are loaded through stack if they cross page boundary.
// General SSE4.2 pcmpestri-based indexOf: finds str2 (constant length
// int_cnt2 in 1..7, or runtime length in cnt2 when int_cnt2 == -1) within
// str1 and leaves the element index (or -1) in 'result'.
void MacroAssembler::string_indexof(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    int int_cnt2,  Register result,
                                    XMMRegister vec, Register tmp) {
  assert(UseSSE42Intrinsics, "SSE4.2 is required");
  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == 1) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (int_cnt2 == 2) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (int_cnt2 == 4) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 }
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*typeArrayKlass::header_size() >= 12,"sanity");
        // Load 16 bytes ending at the last substring char, then shift the
        // chars down to the low end of the vector.
        movdqu(vec, Address(str2, (int_cnt2*2)-16));
        psrldq(vec, 16-(int_cnt2*2));
      }
    } else { // not constant substring
      cmpl(cnt2, 8);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str2+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-2;
      push(cnt2);

      // Copy substring char-by-char onto the stack (backwards, cnt2..1).
      bind(COPY_SUBSTR);
      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, 8);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -2;
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy string char-by-char onto the stack (backwards, cnt1..1).
    bind(COPY_STR);
    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      movdqu(vec, Address(str2, 0));
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    movdqu(vec, Address(str2, 0));
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    shrl(str1, 1);
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);  // advance string pointer by one char (2 bytes)
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1: partial/full match found
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, 8); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, Address::times_2, -16));
  movl(cnt1, 8);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, 8-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, 8);
    jccb(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, 8);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), 0x0d);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, 8);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    addptr(str2, 16);
    subl(cnt1, 8);
    cmpl(cnt2, 8); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
    lea(str2, Address(str2, cnt2, Address::times_2, -16));
    lea(str1, Address(str1, cnt2, Address::times_2, -16));
    subl(cnt1, cnt2);
    movl(cnt2, 8);
    addl(cnt1, 8);
    bind(CONT_SCAN_SUBSTR);
    movdqu(vec, Address(str2, 0));
    jmpb(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));  // restore original string addr
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index

  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
9693 
// Compare strings.
// Lexicographically compares the char arrays str1 (length cnt1) and str2
// (length cnt2); 'result' receives a negative/zero/positive value, falling
// back to the length difference when one string is a prefix of the other.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    XMMRegister vec1) {
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;

  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);                              // save cnt1 - cnt2 for the tie case
  cmov32(Assembler::lessEqual, cnt2, result);  // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  // Load first characters
  load_unsigned_short(result, Address(str1, 0));
  load_unsigned_short(cnt1, Address(str2, 0));

  // Compare first characters
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);
  decrementl(cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  {
    // Check after comparing first character to see if strings are equivalent
    Label LSkip2;
    // Check if the strings start at same location
    cmpptr(str1, str2);
    jccb(Assembler::notEqual, LSkip2);

    // Check if the length difference is zero (from stack)
    cmpl(Address(rsp, 0), 0x0);
    jcc(Assembler::equal,  LENGTH_DIFF_LABEL);

    // Strings might not be equivalent
    bind(LSkip2);
  }

  Address::ScaleFactor scale = Address::times_2;
  int stride = 8;                           // 8 chars per 16-byte vector

  // Advance to next element
  addptr(str1, 16/stride);
  addptr(str2, 16/stride);

  if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 16-byte vectors
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point str1/str2 past the min length and run a negative index up to 0.
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testl(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);

    // Compare the last (possibly overlapping) full vector.
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(result, cnt1);
    movptr(cnt2, result);
    load_unsigned_short(result, Address(str1, cnt2, scale));
    load_unsigned_short(cnt1, Address(str2, cnt2, scale));
    subl(result, cnt1);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }

  // Shift str2 and str1 to the end of the arrays, negate min
  lea(str1, Address(str1, cnt2, scale, 0));
  lea(str2, Address(str2, cnt2, scale, 0));
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  jmpb(DONE_LABEL);

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
}
9829 
// Compare char[] arrays aligned to 4 bytes or substrings.
// Sets 'result' to 1 if the two char sequences are equal, 0 otherwise.
// When is_array_equ is true, ary1/ary2 are array oops (NULL and length
// checks are emitted); otherwise they point directly at char data and
// 'limit' already holds the element count.
void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                        Register limit, Register result, Register chr,
                                        XMMRegister vec1, XMMRegister vec2) {
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  // Check the input args
  cmpptr(ary1, ary2);
  jcc(Assembler::equal, TRUE_LABEL);   // same array (or same address)

  if (is_array_equ) {
    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  shll(limit, 1);      // element count -> byte count (!= 0)
  movl(result, limit); // copy

  if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000e);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point past the vector region and run a negative index up to 0.
    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);           // zero iff the 16 bytes are identical

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);

    // Compare the trailing bytes with one final (overlapping) 16-byte load.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, TRUE_LABEL);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
}
9942 
9943 #ifdef PRODUCT
9944 #define BLOCK_COMMENT(str) /* nothing */
9945 #else
9946 #define BLOCK_COMMENT(str) block_comment(str)
9947 #endif
9948 
9949 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Fill 'count' elements of type t starting at 'to' with 'value'.
// 'count' is an element count; 'value' is replicated to a full 32-bit
// pattern so fills can proceed in 4-byte (or 16-byte SSE) chunks, with
// alignment prologue and 4/2/1-byte tail handling as needed.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  assert_different_registers(to, value, count, rtmp);
  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  Label L_fill_2_bytes, L_fill_4_bytes;

  // shift converts a 4-byte-word count to an element count:
  // (words << shift) == elements.  E.g. "8 << shift" is 32 bytes.
  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate value across all 4 bytes of a 32-bit pattern.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));   // 2 bytes == 1<<(shift-1) elements
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    // No SSE2: fill with 32-bit integer stores only.
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);   // undo the loop's pre-decrement
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Fill 32-byte chunks
      movdl(xtmp, value);
      pshufd(xtmp, xtmp, 0);   // broadcast 32-bit pattern to all 4 dwords

      subl(count, 8 << shift);
      jcc(Assembler::less, L_check_fill_8_bytes);
      align(16);

      BIND(L_fill_32_bytes_loop);

      if (UseUnalignedLoadStores) {
        movdqu(Address(to, 0), xtmp);
        movdqu(Address(to, 16), xtmp);
      } else {
        movq(Address(to, 0), xtmp);
        movq(Address(to, 8), xtmp);
        movq(Address(to, 16), xtmp);
        movq(Address(to, 24), xtmp);
      }

      addptr(to, 32);
      subl(count, 8 << shift);
      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
      BIND(L_check_fill_8_bytes);
      addl(count, 8 << shift);   // undo the loop's pre-decrement
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
10119 #undef BIND
10120 #undef BLOCK_COMMENT
10121 
10122 
10123 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10124   switch (cond) {
10125     // Note some conditions are synonyms for others
10126     case Assembler::zero:         return Assembler::notZero;
10127     case Assembler::notZero:      return Assembler::zero;
10128     case Assembler::less:         return Assembler::greaterEqual;
10129     case Assembler::lessEqual:    return Assembler::greater;
10130     case Assembler::greater:      return Assembler::lessEqual;
10131     case Assembler::greaterEqual: return Assembler::less;
10132     case Assembler::below:        return Assembler::aboveEqual;
10133     case Assembler::belowEqual:   return Assembler::above;
10134     case Assembler::above:        return Assembler::belowEqual;
10135     case Assembler::aboveEqual:   return Assembler::below;
10136     case Assembler::overflow:     return Assembler::noOverflow;
10137     case Assembler::noOverflow:   return Assembler::overflow;
10138     case Assembler::negative:     return Assembler::positive;
10139     case Assembler::positive:     return Assembler::negative;
10140     case Assembler::parity:       return Assembler::noParity;
10141     case Assembler::noParity:     return Assembler::parity;
10142   }
10143   ShouldNotReachHere(); return Assembler::overflow;
10144 }
10145 
10146 SkipIfEqual::SkipIfEqual(
10147     MacroAssembler* masm, const bool* flag_addr, bool value) {
10148   _masm = masm;
10149   _masm->cmp8(ExternalAddress((address)flag_addr), value);
10150   _masm->jcc(Assembler::equal, _label);
10151 }
10152 
SkipIfEqual::~SkipIfEqual() {
  // Bind the skip target here so the conditional jump emitted by the
  // constructor lands past everything generated during this object's
  // lifetime.
  _masm->bind(_label);
}