1 /*
   2  * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "assembler_x86.inline.hpp"
  27 #include "gc_interface/collectedHeap.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "memory/cardTableModRefBS.hpp"
  30 #include "memory/resourceArea.hpp"
  31 #include "prims/methodHandles.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/interfaceSupport.hpp"
  34 #include "runtime/objectMonitor.hpp"
  35 #include "runtime/os.hpp"
  36 #include "runtime/sharedRuntime.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #ifndef SERIALGC
  39 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  40 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  41 #include "gc_implementation/g1/heapRegion.hpp"
  42 #endif
  43 
  44 // Implementation of AddressLiteral
  45 
  46 AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  47   _is_lval = false;
  48   _target = target;
  49   switch (rtype) {
  50   case relocInfo::oop_type:
  51     // Oops are a special case. Normally they would be their own section
  52     // but in cases like icBuffer they are literals in the code stream that
  53     // we don't have a section for. We use none so that we get a literal address
  54     // which is always patchable.
  55     break;
  56   case relocInfo::external_word_type:
  57     _rspec = external_word_Relocation::spec(target);
  58     break;
  59   case relocInfo::internal_word_type:
  60     _rspec = internal_word_Relocation::spec(target);
  61     break;
  62   case relocInfo::opt_virtual_call_type:
  63     _rspec = opt_virtual_call_Relocation::spec();
  64     break;
  65   case relocInfo::static_call_type:
  66     _rspec = static_call_Relocation::spec();
  67     break;
  68   case relocInfo::runtime_call_type:
  69     _rspec = runtime_call_Relocation::spec();
  70     break;
  71   case relocInfo::poll_type:
  72   case relocInfo::poll_return_type:
  73     _rspec = Relocation::spec_simple(rtype);
  74     break;
  75   case relocInfo::none:
  76     break;
  77   default:
  78     ShouldNotReachHere();
  79     break;
  80   }
  81 }
  82 
  83 // Implementation of Address
  84 
  85 #ifdef _LP64
  86 
  87 Address Address::make_array(ArrayAddress adr) {
  88   // Not implementable on 64bit machines
  89   // Should have been handled higher up the call chain.
  90   ShouldNotReachHere();
  91   return Address();
  92 }
  93 
  94 // exceedingly dangerous constructor
  95 Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  96   _base  = noreg;
  97   _index = noreg;
  98   _scale = no_scale;
  99   _disp  = disp;
 100   switch (rtype) {
 101     case relocInfo::external_word_type:
 102       _rspec = external_word_Relocation::spec(loc);
 103       break;
 104     case relocInfo::internal_word_type:
 105       _rspec = internal_word_Relocation::spec(loc);
 106       break;
 107     case relocInfo::runtime_call_type:
 108       // HMM
 109       _rspec = runtime_call_Relocation::spec();
 110       break;
 111     case relocInfo::poll_type:
 112     case relocInfo::poll_return_type:
 113       _rspec = Relocation::spec_simple(rtype);
 114       break;
 115     case relocInfo::none:
 116       break;
 117     default:
 118       ShouldNotReachHere();
 119   }
 120 }
 121 #else // LP64
 122 
 123 Address Address::make_array(ArrayAddress adr) {
 124   AddressLiteral base = adr.base();
 125   Address index = adr.index();
 126   assert(index._disp == 0, "must not have disp"); // maybe it can?
 127   Address array(index._base, index._index, index._scale, (intptr_t) base.target());
 128   array._rspec = base._rspec;
 129   return array;
 130 }
 131 
 132 // exceedingly dangerous constructor
 133 Address::Address(address loc, RelocationHolder spec) {
 134   _base  = noreg;
 135   _index = noreg;
 136   _scale = no_scale;
 137   _disp  = (intptr_t) loc;
 138   _rspec = spec;
 139 }
 140 
 141 #endif // _LP64
 142 
 143 
 144 
 145 // Convert the raw encoding form into the form expected by the constructor for
 146 // Address.  An index of 4 (rsp) corresponds to having no index, so convert
 147 // that to noreg for the Address constructor.
 148 Address Address::make_raw(int base, int index, int scale, int disp, bool disp_is_oop) {
 149   RelocationHolder rspec;
 150   if (disp_is_oop) {
 151     rspec = Relocation::spec_simple(relocInfo::oop_type);
 152   }
 153   bool valid_index = index != rsp->encoding();
 154   if (valid_index) {
 155     Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
 156     madr._rspec = rspec;
 157     return madr;
 158   } else {
 159     Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
 160     madr._rspec = rspec;
 161     return madr;
 162   }
 163 }
 164 
 165 // Implementation of Assembler
 166 
 167 int AbstractAssembler::code_fill_byte() {
 168   return (u_char)'\xF4'; // hlt
 169 }
 170 
 171 // make this go away someday
 172 void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
 173   if (rtype == relocInfo::none)
 174         emit_long(data);
 175   else  emit_data(data, Relocation::spec_simple(rtype), format);
 176 }
 177 
 178 void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
 179   assert(imm_operand == 0, "default format must be immediate in this file");
 180   assert(inst_mark() != NULL, "must be inside InstructionMark");
 181   if (rspec.type() !=  relocInfo::none) {
 182     #ifdef ASSERT
 183       check_relocation(rspec, format);
 184     #endif
 185     // Do not use AbstractAssembler::relocate, which is not intended for
 186     // embedded words.  Instead, relocate to the enclosing instruction.
 187 
 188     // hack. call32 is too wide for mask so use disp32
 189     if (format == call32_operand)
 190       code_section()->relocate(inst_mark(), rspec, disp32_operand);
 191     else
 192       code_section()->relocate(inst_mark(), rspec, format);
 193   }
 194   emit_long(data);
 195 }
 196 
 197 static int encode(Register r) {
 198   int enc = r->encoding();
 199   if (enc >= 8) {
 200     enc -= 8;
 201   }
 202   return enc;
 203 }
 204 
 205 static int encode(XMMRegister r) {
 206   int enc = r->encoding();
 207   if (enc >= 8) {
 208     enc -= 8;
 209   }
 210   return enc;
 211 }
 212 
 213 void Assembler::emit_arith_b(int op1, int op2, Register dst, int imm8) {
 214   assert(dst->has_byte_register(), "must have byte register");
 215   assert(isByte(op1) && isByte(op2), "wrong opcode");
 216   assert(isByte(imm8), "not a byte");
 217   assert((op1 & 0x01) == 0, "should be 8bit operation");
 218   emit_byte(op1);
 219   emit_byte(op2 | encode(dst));
 220   emit_byte(imm8);
 221 }
 222 
 223 
 224 void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
 225   assert(isByte(op1) && isByte(op2), "wrong opcode");
 226   assert((op1 & 0x01) == 1, "should be 32bit operation");
 227   assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
 228   if (is8bit(imm32)) {
 229     emit_byte(op1 | 0x02); // set sign bit
 230     emit_byte(op2 | encode(dst));
 231     emit_byte(imm32 & 0xFF);
 232   } else {
 233     emit_byte(op1);
 234     emit_byte(op2 | encode(dst));
 235     emit_long(imm32);
 236   }
 237 }
 238 
 239 // immediate-to-memory forms
 240 void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
 241   assert((op1 & 0x01) == 1, "should be 32bit operation");
 242   assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
 243   if (is8bit(imm32)) {
 244     emit_byte(op1 | 0x02); // set sign bit
 245     emit_operand(rm, adr, 1);
 246     emit_byte(imm32 & 0xFF);
 247   } else {
 248     emit_byte(op1);
 249     emit_operand(rm, adr, 4);
 250     emit_long(imm32);
 251   }
 252 }
 253 
 254 void Assembler::emit_arith(int op1, int op2, Register dst, jobject obj) {
 255   LP64_ONLY(ShouldNotReachHere());
 256   assert(isByte(op1) && isByte(op2), "wrong opcode");
 257   assert((op1 & 0x01) == 1, "should be 32bit operation");
 258   assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
 259   InstructionMark im(this);
 260   emit_byte(op1);
 261   emit_byte(op2 | encode(dst));
 262   emit_data((intptr_t)obj, relocInfo::oop_type, 0);
 263 }
 264 
 265 
 266 void Assembler::emit_arith(int op1, int op2, Register dst, Register src) {
 267   assert(isByte(op1) && isByte(op2), "wrong opcode");
 268   emit_byte(op1);
 269   emit_byte(op2 | encode(dst) << 3 | encode(src));
 270 }
 271 
 272 
 273 void Assembler::emit_operand(Register reg, Register base, Register index,
 274                              Address::ScaleFactor scale, int disp,
 275                              RelocationHolder const& rspec,
 276                              int rip_relative_correction) {
 277   relocInfo::relocType rtype = (relocInfo::relocType) rspec.type();
 278 
 279   // Encode the registers as needed in the fields they are used in
 280 
 281   int regenc = encode(reg) << 3;
 282   int indexenc = index->is_valid() ? encode(index) << 3 : 0;
 283   int baseenc = base->is_valid() ? encode(base) : 0;
 284 
 285   if (base->is_valid()) {
 286     if (index->is_valid()) {
 287       assert(scale != Address::no_scale, "inconsistent address");
 288       // [base + index*scale + disp]
 289       if (disp == 0 && rtype == relocInfo::none  &&
 290           base != rbp LP64_ONLY(&& base != r13)) {
 291         // [base + index*scale]
 292         // [00 reg 100][ss index base]
 293         assert(index != rsp, "illegal addressing mode");
 294         emit_byte(0x04 | regenc);
 295         emit_byte(scale << 6 | indexenc | baseenc);
 296       } else if (is8bit(disp) && rtype == relocInfo::none) {
 297         // [base + index*scale + imm8]
 298         // [01 reg 100][ss index base] imm8
 299         assert(index != rsp, "illegal addressing mode");
 300         emit_byte(0x44 | regenc);
 301         emit_byte(scale << 6 | indexenc | baseenc);
 302         emit_byte(disp & 0xFF);
 303       } else {
 304         // [base + index*scale + disp32]
 305         // [10 reg 100][ss index base] disp32
 306         assert(index != rsp, "illegal addressing mode");
 307         emit_byte(0x84 | regenc);
 308         emit_byte(scale << 6 | indexenc | baseenc);
 309         emit_data(disp, rspec, disp32_operand);
 310       }
 311     } else if (base == rsp LP64_ONLY(|| base == r12)) {
 312       // [rsp + disp]
 313       if (disp == 0 && rtype == relocInfo::none) {
 314         // [rsp]
 315         // [00 reg 100][00 100 100]
 316         emit_byte(0x04 | regenc);
 317         emit_byte(0x24);
 318       } else if (is8bit(disp) && rtype == relocInfo::none) {
 319         // [rsp + imm8]
 320         // [01 reg 100][00 100 100] disp8
 321         emit_byte(0x44 | regenc);
 322         emit_byte(0x24);
 323         emit_byte(disp & 0xFF);
 324       } else {
 325         // [rsp + imm32]
 326         // [10 reg 100][00 100 100] disp32
 327         emit_byte(0x84 | regenc);
 328         emit_byte(0x24);
 329         emit_data(disp, rspec, disp32_operand);
 330       }
 331     } else {
 332       // [base + disp]
 333       assert(base != rsp LP64_ONLY(&& base != r12), "illegal addressing mode");
 334       if (disp == 0 && rtype == relocInfo::none &&
 335           base != rbp LP64_ONLY(&& base != r13)) {
 336         // [base]
 337         // [00 reg base]
 338         emit_byte(0x00 | regenc | baseenc);
 339       } else if (is8bit(disp) && rtype == relocInfo::none) {
 340         // [base + disp8]
 341         // [01 reg base] disp8
 342         emit_byte(0x40 | regenc | baseenc);
 343         emit_byte(disp & 0xFF);
 344       } else {
 345         // [base + disp32]
 346         // [10 reg base] disp32
 347         emit_byte(0x80 | regenc | baseenc);
 348         emit_data(disp, rspec, disp32_operand);
 349       }
 350     }
 351   } else {
 352     if (index->is_valid()) {
 353       assert(scale != Address::no_scale, "inconsistent address");
 354       // [index*scale + disp]
 355       // [00 reg 100][ss index 101] disp32
 356       assert(index != rsp, "illegal addressing mode");
 357       emit_byte(0x04 | regenc);
 358       emit_byte(scale << 6 | indexenc | 0x05);
 359       emit_data(disp, rspec, disp32_operand);
 360     } else if (rtype != relocInfo::none ) {
 361       // [disp] (64bit) RIP-RELATIVE (32bit) abs
 362       // [00 000 101] disp32
 363 
 364       emit_byte(0x05 | regenc);
 365       // Note that the RIP-rel. correction applies to the generated
 366       // disp field, but _not_ to the target address in the rspec.
 367 
 368       // disp was created by converting the target address minus the pc
 369       // at the start of the instruction. That needs more correction here.
 370       // intptr_t disp = target - next_ip;
 371       assert(inst_mark() != NULL, "must be inside InstructionMark");
 372       address next_ip = pc() + sizeof(int32_t) + rip_relative_correction;
 373       int64_t adjusted = disp;
 374       // Do rip-rel adjustment for 64bit
 375       LP64_ONLY(adjusted -=  (next_ip - inst_mark()));
 376       assert(is_simm32(adjusted),
 377              "must be 32bit offset (RIP relative address)");
 378       emit_data((int32_t) adjusted, rspec, disp32_operand);
 379 
 380     } else {
 381       // 32bit never did this, did everything as the rip-rel/disp code above
 382       // [disp] ABSOLUTE
 383       // [00 reg 100][00 100 101] disp32
 384       emit_byte(0x04 | regenc);
 385       emit_byte(0x25);
 386       emit_data(disp, rspec, disp32_operand);
 387     }
 388   }
 389 }
 390 
 391 void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
 392                              Address::ScaleFactor scale, int disp,
 393                              RelocationHolder const& rspec) {
 394   emit_operand((Register)reg, base, index, scale, disp, rspec);
 395 }
 396 
 397 // Secret local extension to Assembler::WhichOperand:
 398 #define end_pc_operand (_WhichOperand_limit)
 399 
 400 address Assembler::locate_operand(address inst, WhichOperand which) {
 401   // Decode the given instruction, and return the address of
 402   // an embedded 32-bit operand word.
 403 
 404   // If "which" is disp32_operand, selects the displacement portion
 405   // of an effective address specifier.
 406   // If "which" is imm64_operand, selects the trailing immediate constant.
 407   // If "which" is call32_operand, selects the displacement of a call or jump.
 408   // Caller is responsible for ensuring that there is such an operand,
 409   // and that it is 32/64 bits wide.
 410 
 411   // If "which" is end_pc_operand, find the end of the instruction.
 412 
 413   address ip = inst;
 414   bool is_64bit = false;
 415 
 416   debug_only(bool has_disp32 = false);
 417   int tail_size = 0; // other random bytes (#32, #16, etc.) at end of insn
 418 
 419   again_after_prefix:
 420   switch (0xFF & *ip++) {
 421 
 422   // These convenience macros generate groups of "case" labels for the switch.
 423 #define REP4(x) (x)+0: case (x)+1: case (x)+2: case (x)+3
 424 #define REP8(x) (x)+0: case (x)+1: case (x)+2: case (x)+3: \
 425              case (x)+4: case (x)+5: case (x)+6: case (x)+7
 426 #define REP16(x) REP8((x)+0): \
 427               case REP8((x)+8)
 428 
 429   case CS_segment:
 430   case SS_segment:
 431   case DS_segment:
 432   case ES_segment:
 433   case FS_segment:
 434   case GS_segment:
 435     // Seems dubious
 436     LP64_ONLY(assert(false, "shouldn't have that prefix"));
 437     assert(ip == inst+1, "only one prefix allowed");
 438     goto again_after_prefix;
 439 
 440   case 0x67:
 441   case REX:
 442   case REX_B:
 443   case REX_X:
 444   case REX_XB:
 445   case REX_R:
 446   case REX_RB:
 447   case REX_RX:
 448   case REX_RXB:
 449     NOT_LP64(assert(false, "64bit prefixes"));
 450     goto again_after_prefix;
 451 
 452   case REX_W:
 453   case REX_WB:
 454   case REX_WX:
 455   case REX_WXB:
 456   case REX_WR:
 457   case REX_WRB:
 458   case REX_WRX:
 459   case REX_WRXB:
 460     NOT_LP64(assert(false, "64bit prefixes"));
 461     is_64bit = true;
 462     goto again_after_prefix;
 463 
 464   case 0xFF: // pushq a; decl a; incl a; call a; jmp a
 465   case 0x88: // movb a, r
 466   case 0x89: // movl a, r
 467   case 0x8A: // movb r, a
 468   case 0x8B: // movl r, a
 469   case 0x8F: // popl a
 470     debug_only(has_disp32 = true);
 471     break;
 472 
 473   case 0x68: // pushq #32
 474     if (which == end_pc_operand) {
 475       return ip + 4;
 476     }
 477     assert(which == imm_operand && !is_64bit, "pushl has no disp32 or 64bit immediate");
 478     return ip;                  // not produced by emit_operand
 479 
 480   case 0x66: // movw ... (size prefix)
 481     again_after_size_prefix2:
 482     switch (0xFF & *ip++) {
 483     case REX:
 484     case REX_B:
 485     case REX_X:
 486     case REX_XB:
 487     case REX_R:
 488     case REX_RB:
 489     case REX_RX:
 490     case REX_RXB:
 491     case REX_W:
 492     case REX_WB:
 493     case REX_WX:
 494     case REX_WXB:
 495     case REX_WR:
 496     case REX_WRB:
 497     case REX_WRX:
 498     case REX_WRXB:
 499       NOT_LP64(assert(false, "64bit prefix found"));
 500       goto again_after_size_prefix2;
 501     case 0x8B: // movw r, a
 502     case 0x89: // movw a, r
 503       debug_only(has_disp32 = true);
 504       break;
 505     case 0xC7: // movw a, #16
 506       debug_only(has_disp32 = true);
 507       tail_size = 2;  // the imm16
 508       break;
 509     case 0x0F: // several SSE/SSE2 variants
 510       ip--;    // reparse the 0x0F
 511       goto again_after_prefix;
 512     default:
 513       ShouldNotReachHere();
 514     }
 515     break;
 516 
 517   case REP8(0xB8): // movl/q r, #32/#64(oop?)
 518     if (which == end_pc_operand)  return ip + (is_64bit ? 8 : 4);
 519     // these asserts are somewhat nonsensical
 520 #ifndef _LP64
 521     assert(which == imm_operand || which == disp32_operand, "");
 522 #else
 523     assert((which == call32_operand || which == imm_operand) && is_64bit ||
 524            which == narrow_oop_operand && !is_64bit, "");
 525 #endif // _LP64
 526     return ip;
 527 
 528   case 0x69: // imul r, a, #32
 529   case 0xC7: // movl a, #32(oop?)
 530     tail_size = 4;
 531     debug_only(has_disp32 = true); // has both kinds of operands!
 532     break;
 533 
 534   case 0x0F: // movx..., etc.
 535     switch (0xFF & *ip++) {
 536     case 0x12: // movlps
 537     case 0x28: // movaps
 538     case 0x2E: // ucomiss
 539     case 0x2F: // comiss
 540     case 0x54: // andps
 541     case 0x55: // andnps
 542     case 0x56: // orps
 543     case 0x57: // xorps
 544     case 0x6E: // movd
 545     case 0x7E: // movd
 546     case 0xAE: // ldmxcsr   a
 547       // 64bit side says it these have both operands but that doesn't
 548       // appear to be true
 549       debug_only(has_disp32 = true);
 550       break;
 551 
 552     case 0xAD: // shrd r, a, %cl
 553     case 0xAF: // imul r, a
 554     case 0xBE: // movsbl r, a (movsxb)
 555     case 0xBF: // movswl r, a (movsxw)
 556     case 0xB6: // movzbl r, a (movzxb)
 557     case 0xB7: // movzwl r, a (movzxw)
 558     case REP16(0x40): // cmovl cc, r, a
 559     case 0xB0: // cmpxchgb
 560     case 0xB1: // cmpxchg
 561     case 0xC1: // xaddl
 562     case 0xC7: // cmpxchg8
 563     case REP16(0x90): // setcc a
 564       debug_only(has_disp32 = true);
 565       // fall out of the switch to decode the address
 566       break;
 567 
 568     case 0xAC: // shrd r, a, #8
 569       debug_only(has_disp32 = true);
 570       tail_size = 1;  // the imm8
 571       break;
 572 
 573     case REP16(0x80): // jcc rdisp32
 574       if (which == end_pc_operand)  return ip + 4;
 575       assert(which == call32_operand, "jcc has no disp32 or imm");
 576       return ip;
 577     default:
 578       ShouldNotReachHere();
 579     }
 580     break;
 581 
 582   case 0x81: // addl a, #32; addl r, #32
 583     // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
 584     // on 32bit in the case of cmpl, the imm might be an oop
 585     tail_size = 4;
 586     debug_only(has_disp32 = true); // has both kinds of operands!
 587     break;
 588 
 589   case 0x83: // addl a, #8; addl r, #8
 590     // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
 591     debug_only(has_disp32 = true); // has both kinds of operands!
 592     tail_size = 1;
 593     break;
 594 
 595   case 0x9B:
 596     switch (0xFF & *ip++) {
 597     case 0xD9: // fnstcw a
 598       debug_only(has_disp32 = true);
 599       break;
 600     default:
 601       ShouldNotReachHere();
 602     }
 603     break;
 604 
 605   case REP4(0x00): // addb a, r; addl a, r; addb r, a; addl r, a
 606   case REP4(0x10): // adc...
 607   case REP4(0x20): // and...
 608   case REP4(0x30): // xor...
 609   case REP4(0x08): // or...
 610   case REP4(0x18): // sbb...
 611   case REP4(0x28): // sub...
 612   case 0xF7: // mull a
 613   case 0x8D: // lea r, a
 614   case 0x87: // xchg r, a
 615   case REP4(0x38): // cmp...
 616   case 0x85: // test r, a
 617     debug_only(has_disp32 = true); // has both kinds of operands!
 618     break;
 619 
 620   case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
 621   case 0xC6: // movb a, #8
 622   case 0x80: // cmpb a, #8
 623   case 0x6B: // imul r, a, #8
 624     debug_only(has_disp32 = true); // has both kinds of operands!
 625     tail_size = 1; // the imm8
 626     break;
 627 
 628   case 0xE8: // call rdisp32
 629   case 0xE9: // jmp  rdisp32
 630     if (which == end_pc_operand)  return ip + 4;
 631     assert(which == call32_operand, "call has no disp32 or imm");
 632     return ip;
 633 
 634   case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
 635   case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
 636   case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
 637   case 0xDD: // fld_d a; fst_d a; fstp_d a
 638   case 0xDB: // fild_s a; fistp_s a; fld_x a; fstp_x a
 639   case 0xDF: // fild_d a; fistp_d a
 640   case 0xD8: // fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a
 641   case 0xDC: // fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a
 642   case 0xDE: // faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a
 643     debug_only(has_disp32 = true);
 644     break;
 645 
 646   case 0xF0:                    // Lock
 647     assert(os::is_MP(), "only on MP");
 648     goto again_after_prefix;
 649 
 650   case 0xF3:                    // For SSE
 651   case 0xF2:                    // For SSE2
 652     switch (0xFF & *ip++) {
 653     case REX:
 654     case REX_B:
 655     case REX_X:
 656     case REX_XB:
 657     case REX_R:
 658     case REX_RB:
 659     case REX_RX:
 660     case REX_RXB:
 661     case REX_W:
 662     case REX_WB:
 663     case REX_WX:
 664     case REX_WXB:
 665     case REX_WR:
 666     case REX_WRB:
 667     case REX_WRX:
 668     case REX_WRXB:
 669       NOT_LP64(assert(false, "found 64bit prefix"));
 670       ip++;
 671     default:
 672       ip++;
 673     }
 674     debug_only(has_disp32 = true); // has both kinds of operands!
 675     break;
 676 
 677   default:
 678     ShouldNotReachHere();
 679 
 680 #undef REP8
 681 #undef REP16
 682   }
 683 
 684   assert(which != call32_operand, "instruction is not a call, jmp, or jcc");
 685 #ifdef _LP64
 686   assert(which != imm_operand, "instruction is not a movq reg, imm64");
 687 #else
 688   // assert(which != imm_operand || has_imm32, "instruction has no imm32 field");
 689   assert(which != imm_operand || has_disp32, "instruction has no imm32 field");
 690 #endif // LP64
 691   assert(which != disp32_operand || has_disp32, "instruction has no disp32 field");
 692 
 693   // parse the output of emit_operand
 694   int op2 = 0xFF & *ip++;
 695   int base = op2 & 0x07;
 696   int op3 = -1;
 697   const int b100 = 4;
 698   const int b101 = 5;
 699   if (base == b100 && (op2 >> 6) != 3) {
 700     op3 = 0xFF & *ip++;
 701     base = op3 & 0x07;   // refetch the base
 702   }
 703   // now ip points at the disp (if any)
 704 
 705   switch (op2 >> 6) {
 706   case 0:
 707     // [00 reg  100][ss index base]
 708     // [00 reg  100][00   100  esp]
 709     // [00 reg base]
 710     // [00 reg  100][ss index  101][disp32]
 711     // [00 reg  101]               [disp32]
 712 
 713     if (base == b101) {
 714       if (which == disp32_operand)
 715         return ip;              // caller wants the disp32
 716       ip += 4;                  // skip the disp32
 717     }
 718     break;
 719 
 720   case 1:
 721     // [01 reg  100][ss index base][disp8]
 722     // [01 reg  100][00   100  esp][disp8]
 723     // [01 reg base]               [disp8]
 724     ip += 1;                    // skip the disp8
 725     break;
 726 
 727   case 2:
 728     // [10 reg  100][ss index base][disp32]
 729     // [10 reg  100][00   100  esp][disp32]
 730     // [10 reg base]               [disp32]
 731     if (which == disp32_operand)
 732       return ip;                // caller wants the disp32
 733     ip += 4;                    // skip the disp32
 734     break;
 735 
 736   case 3:
 737     // [11 reg base]  (not a memory addressing mode)
 738     break;
 739   }
 740 
 741   if (which == end_pc_operand) {
 742     return ip + tail_size;
 743   }
 744 
 745 #ifdef _LP64
 746   assert(which == narrow_oop_operand && !is_64bit, "instruction is not a movl adr, imm32");
 747 #else
 748   assert(which == imm_operand, "instruction has only an imm field");
 749 #endif // LP64
 750   return ip;
 751 }
 752 
 753 address Assembler::locate_next_instruction(address inst) {
 754   // Secretly share code with locate_operand:
 755   return locate_operand(inst, end_pc_operand);
 756 }
 757 
 758 
 759 #ifdef ASSERT
 760 void Assembler::check_relocation(RelocationHolder const& rspec, int format) {
 761   address inst = inst_mark();
 762   assert(inst != NULL && inst < pc(), "must point to beginning of instruction");
 763   address opnd;
 764 
 765   Relocation* r = rspec.reloc();
 766   if (r->type() == relocInfo::none) {
 767     return;
 768   } else if (r->is_call() || format == call32_operand) {
 769     // assert(format == imm32_operand, "cannot specify a nonzero format");
 770     opnd = locate_operand(inst, call32_operand);
 771   } else if (r->is_data()) {
 772     assert(format == imm_operand || format == disp32_operand
 773            LP64_ONLY(|| format == narrow_oop_operand), "format ok");
 774     opnd = locate_operand(inst, (WhichOperand)format);
 775   } else {
 776     assert(format == imm_operand, "cannot specify a format");
 777     return;
 778   }
 779   assert(opnd == pc(), "must put operand where relocs can find it");
 780 }
 781 #endif // ASSERT
 782 
 783 void Assembler::emit_operand32(Register reg, Address adr) {
 784   assert(reg->encoding() < 8, "no extended registers");
 785   assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
 786   emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
 787                adr._rspec);
 788 }
 789 
 790 void Assembler::emit_operand(Register reg, Address adr,
 791                              int rip_relative_correction) {
 792   emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
 793                adr._rspec,
 794                rip_relative_correction);
 795 }
 796 
 797 void Assembler::emit_operand(XMMRegister reg, Address adr) {
 798   emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
 799                adr._rspec);
 800 }
 801 
 802 // MMX operations
 803 void Assembler::emit_operand(MMXRegister reg, Address adr) {
 804   assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
 805   emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
 806 }
 807 
 808 // work around gcc (3.2.1-7a) bug
 809 void Assembler::emit_operand(Address adr, MMXRegister reg) {
 810   assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
 811   emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
 812 }
 813 
 814 
 815 void Assembler::emit_farith(int b1, int b2, int i) {
 816   assert(isByte(b1) && isByte(b2), "wrong opcode");
 817   assert(0 <= i &&  i < 8, "illegal stack offset");
 818   emit_byte(b1);
 819   emit_byte(b2 + i);
 820 }
 821 
 822 
 823 // Now the Assembler instructions (identical for 32/64 bits)
 824 
 825 void Assembler::adcl(Address dst, int32_t imm32) {
 826   InstructionMark im(this);
 827   prefix(dst);
 828   emit_arith_operand(0x81, rdx, dst, imm32);
 829 }
 830 
 831 void Assembler::adcl(Address dst, Register src) {
 832   InstructionMark im(this);
 833   prefix(dst, src);
 834   emit_byte(0x11);
 835   emit_operand(src, dst);
 836 }
 837 
 838 void Assembler::adcl(Register dst, int32_t imm32) {
 839   prefix(dst);
 840   emit_arith(0x81, 0xD0, dst, imm32);
 841 }
 842 
 843 void Assembler::adcl(Register dst, Address src) {
 844   InstructionMark im(this);
 845   prefix(src, dst);
 846   emit_byte(0x13);
 847   emit_operand(dst, src);
 848 }
 849 
 850 void Assembler::adcl(Register dst, Register src) {
 851   (void) prefix_and_encode(dst->encoding(), src->encoding());
 852   emit_arith(0x13, 0xC0, dst, src);
 853 }
 854 
 855 void Assembler::addl(Address dst, int32_t imm32) {
 856   InstructionMark im(this);
 857   prefix(dst);
 858   emit_arith_operand(0x81, rax, dst, imm32);
 859 }
 860 
 861 void Assembler::addl(Address dst, Register src) {
 862   InstructionMark im(this);
 863   prefix(dst, src);
 864   emit_byte(0x01);
 865   emit_operand(src, dst);
 866 }
 867 
 868 void Assembler::addl(Register dst, int32_t imm32) {
 869   prefix(dst);
 870   emit_arith(0x81, 0xC0, dst, imm32);
 871 }
 872 
 873 void Assembler::addl(Register dst, Address src) {
 874   InstructionMark im(this);
 875   prefix(src, dst);
 876   emit_byte(0x03);
 877   emit_operand(dst, src);
 878 }
 879 
 880 void Assembler::addl(Register dst, Register src) {
 881   (void) prefix_and_encode(dst->encoding(), src->encoding());
 882   emit_arith(0x03, 0xC0, dst, src);
 883 }
 884 
 885 void Assembler::addr_nop_4() {
 886   // 4 bytes: NOP DWORD PTR [EAX+0]
 887   emit_byte(0x0F);
 888   emit_byte(0x1F);
 889   emit_byte(0x40); // emit_rm(cbuf, 0x1, EAX_enc, EAX_enc);
 890   emit_byte(0);    // 8-bits offset (1 byte)
 891 }
 892 
 893 void Assembler::addr_nop_5() {
 894   // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
 895   emit_byte(0x0F);
 896   emit_byte(0x1F);
 897   emit_byte(0x44); // emit_rm(cbuf, 0x1, EAX_enc, 0x4);
 898   emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
 899   emit_byte(0);    // 8-bits offset (1 byte)
 900 }
 901 
 902 void Assembler::addr_nop_7() {
 903   // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
 904   emit_byte(0x0F);
 905   emit_byte(0x1F);
 906   emit_byte(0x80); // emit_rm(cbuf, 0x2, EAX_enc, EAX_enc);
 907   emit_long(0);    // 32-bits offset (4 bytes)
 908 }
 909 
 910 void Assembler::addr_nop_8() {
 911   // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
 912   emit_byte(0x0F);
 913   emit_byte(0x1F);
 914   emit_byte(0x84); // emit_rm(cbuf, 0x2, EAX_enc, 0x4);
 915   emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
 916   emit_long(0);    // 32-bits offset (4 bytes)
 917 }
 918 
 919 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
 920   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
 921   emit_byte(0xF2);
 922   int encode = prefix_and_encode(dst->encoding(), src->encoding());
 923   emit_byte(0x0F);
 924   emit_byte(0x58);
 925   emit_byte(0xC0 | encode);
 926 }
 927 
 928 void Assembler::addsd(XMMRegister dst, Address src) {
 929   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
 930   InstructionMark im(this);
 931   emit_byte(0xF2);
 932   prefix(src, dst);
 933   emit_byte(0x0F);
 934   emit_byte(0x58);
 935   emit_operand(dst, src);
 936 }
 937 
 938 void Assembler::addss(XMMRegister dst, XMMRegister src) {
 939   NOT_LP64(assert(VM_Version::supports_sse(), ""));
 940   emit_byte(0xF3);
 941   int encode = prefix_and_encode(dst->encoding(), src->encoding());
 942   emit_byte(0x0F);
 943   emit_byte(0x58);
 944   emit_byte(0xC0 | encode);
 945 }
 946 
 947 void Assembler::addss(XMMRegister dst, Address src) {
 948   NOT_LP64(assert(VM_Version::supports_sse(), ""));
 949   InstructionMark im(this);
 950   emit_byte(0xF3);
 951   prefix(src, dst);
 952   emit_byte(0x0F);
 953   emit_byte(0x58);
 954   emit_operand(dst, src);
 955 }
 956 
 957 void Assembler::andl(Register dst, int32_t imm32) {
 958   prefix(dst);
 959   emit_arith(0x81, 0xE0, dst, imm32);
 960 }
 961 
 962 void Assembler::andl(Register dst, Address src) {
 963   InstructionMark im(this);
 964   prefix(src, dst);
 965   emit_byte(0x23);
 966   emit_operand(dst, src);
 967 }
 968 
 969 void Assembler::andl(Register dst, Register src) {
 970   (void) prefix_and_encode(dst->encoding(), src->encoding());
 971   emit_arith(0x23, 0xC0, dst, src);
 972 }
 973 
 974 void Assembler::andpd(XMMRegister dst, Address src) {
 975   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
 976   InstructionMark im(this);
 977   emit_byte(0x66);
 978   prefix(src, dst);
 979   emit_byte(0x0F);
 980   emit_byte(0x54);
 981   emit_operand(dst, src);
 982 }
 983 
 984 void Assembler::bsfl(Register dst, Register src) {
 985   int encode = prefix_and_encode(dst->encoding(), src->encoding());
 986   emit_byte(0x0F);
 987   emit_byte(0xBC);
 988   emit_byte(0xC0 | encode);
 989 }
 990 
 991 void Assembler::bsrl(Register dst, Register src) {
 992   assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
 993   int encode = prefix_and_encode(dst->encoding(), src->encoding());
 994   emit_byte(0x0F);
 995   emit_byte(0xBD);
 996   emit_byte(0xC0 | encode);
 997 }
 998 
 999 void Assembler::bswapl(Register reg) { // bswap
1000   int encode = prefix_and_encode(reg->encoding());
1001   emit_byte(0x0F);
1002   emit_byte(0xC8 | encode);
1003 }
1004 
1005 void Assembler::call(Label& L, relocInfo::relocType rtype) {
1006   // suspect disp32 is always good
1007   int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);
1008 
1009   if (L.is_bound()) {
1010     const int long_size = 5;
1011     int offs = (int)( target(L) - pc() );
1012     assert(offs <= 0, "assembler error");
1013     InstructionMark im(this);
1014     // 1110 1000 #32-bit disp
1015     emit_byte(0xE8);
1016     emit_data(offs - long_size, rtype, operand);
1017   } else {
1018     InstructionMark im(this);
1019     // 1110 1000 #32-bit disp
1020     L.add_patch_at(code(), locator());
1021 
1022     emit_byte(0xE8);
1023     emit_data(int(0), rtype, operand);
1024   }
1025 }
1026 
1027 void Assembler::call(Register dst) {
1028   // This was originally using a 32bit register encoding
1029   // and surely we want 64bit!
1030   // this is a 32bit encoding but in 64bit mode the default
1031   // operand size is 64bit so there is no need for the
1032   // wide prefix. So prefix only happens if we use the
1033   // new registers. Much like push/pop.
1034   int x = offset();
1035   // this may be true but dbx disassembles it as if it
1036   // were 32bits...
1037   // int encode = prefix_and_encode(dst->encoding());
1038   // if (offset() != x) assert(dst->encoding() >= 8, "what?");
1039   int encode = prefixq_and_encode(dst->encoding());
1040 
1041   emit_byte(0xFF);
1042   emit_byte(0xD0 | encode);
1043 }
1044 
1045 
1046 void Assembler::call(Address adr) {
1047   InstructionMark im(this);
1048   prefix(adr);
1049   emit_byte(0xFF);
1050   emit_operand(rdx, adr);
1051 }
1052 
1053 void Assembler::call_literal(address entry, RelocationHolder const& rspec) {
1054   assert(entry != NULL, "call most probably wrong");
1055   InstructionMark im(this);
1056   emit_byte(0xE8);
1057   intptr_t disp = entry - (_code_pos + sizeof(int32_t));
1058   assert(is_simm32(disp), "must be 32bit offset (call2)");
1059   // Technically, should use call32_operand, but this format is
1060   // implied by the fact that we're emitting a call instruction.
1061 
1062   int operand = LP64_ONLY(disp32_operand) NOT_LP64(call32_operand);
1063   emit_data((int) disp, rspec, operand);
1064 }
1065 
1066 void Assembler::cdql() {
1067   emit_byte(0x99);
1068 }
1069 
1070 void Assembler::cmovl(Condition cc, Register dst, Register src) {
1071   NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
1072   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1073   emit_byte(0x0F);
1074   emit_byte(0x40 | cc);
1075   emit_byte(0xC0 | encode);
1076 }
1077 
1078 
1079 void Assembler::cmovl(Condition cc, Register dst, Address src) {
1080   NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
1081   prefix(src, dst);
1082   emit_byte(0x0F);
1083   emit_byte(0x40 | cc);
1084   emit_operand(dst, src);
1085 }
1086 
1087 void Assembler::cmpb(Address dst, int imm8) {
1088   InstructionMark im(this);
1089   prefix(dst);
1090   emit_byte(0x80);
1091   emit_operand(rdi, dst, 1);
1092   emit_byte(imm8);
1093 }
1094 
1095 void Assembler::cmpl(Address dst, int32_t imm32) {
1096   InstructionMark im(this);
1097   prefix(dst);
1098   emit_byte(0x81);
1099   emit_operand(rdi, dst, 4);
1100   emit_long(imm32);
1101 }
1102 
1103 void Assembler::cmpl(Register dst, int32_t imm32) {
1104   prefix(dst);
1105   emit_arith(0x81, 0xF8, dst, imm32);
1106 }
1107 
1108 void Assembler::cmpl(Register dst, Register src) {
1109   (void) prefix_and_encode(dst->encoding(), src->encoding());
1110   emit_arith(0x3B, 0xC0, dst, src);
1111 }
1112 
1113 
1114 void Assembler::cmpl(Register dst, Address  src) {
1115   InstructionMark im(this);
1116   prefix(src, dst);
1117   emit_byte(0x3B);
1118   emit_operand(dst, src);
1119 }
1120 
1121 void Assembler::cmpw(Address dst, int imm16) {
1122   InstructionMark im(this);
1123   assert(!dst.base_needs_rex() && !dst.index_needs_rex(), "no extended registers");
1124   emit_byte(0x66);
1125   emit_byte(0x81);
1126   emit_operand(rdi, dst, 2);
1127   emit_word(imm16);
1128 }
1129 
1130 // The 32-bit cmpxchg compares the value at adr with the contents of rax,
1131 // and stores reg into adr if so; otherwise, the value at adr is loaded into rax,.
1132 // The ZF is set if the compared values were equal, and cleared otherwise.
1133 void Assembler::cmpxchgl(Register reg, Address adr) { // cmpxchg
1134   if (Atomics & 2) {
1135      // caveat: no instructionmark, so this isn't relocatable.
1136      // Emit a synthetic, non-atomic, CAS equivalent.
1137      // Beware.  The synthetic form sets all ICCs, not just ZF.
1138      // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r)
1139      cmpl(rax, adr);
1140      movl(rax, adr);
1141      if (reg != rax) {
1142         Label L ;
1143         jcc(Assembler::notEqual, L);
1144         movl(adr, reg);
1145         bind(L);
1146      }
1147   } else {
1148      InstructionMark im(this);
1149      prefix(adr, reg);
1150      emit_byte(0x0F);
1151      emit_byte(0xB1);
1152      emit_operand(reg, adr);
1153   }
1154 }
1155 
1156 void Assembler::comisd(XMMRegister dst, Address src) {
1157   // NOTE: dbx seems to decode this as comiss even though the
1158   // 0x66 is there. Strangly ucomisd comes out correct
1159   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1160   emit_byte(0x66);
1161   comiss(dst, src);
1162 }
1163 
1164 void Assembler::comiss(XMMRegister dst, Address src) {
1165   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1166 
1167   InstructionMark im(this);
1168   prefix(src, dst);
1169   emit_byte(0x0F);
1170   emit_byte(0x2F);
1171   emit_operand(dst, src);
1172 }
1173 
1174 void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
1175   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1176   emit_byte(0xF3);
1177   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1178   emit_byte(0x0F);
1179   emit_byte(0xE6);
1180   emit_byte(0xC0 | encode);
1181 }
1182 
1183 void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
1184   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1185   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1186   emit_byte(0x0F);
1187   emit_byte(0x5B);
1188   emit_byte(0xC0 | encode);
1189 }
1190 
1191 void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
1192   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1193   emit_byte(0xF2);
1194   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1195   emit_byte(0x0F);
1196   emit_byte(0x5A);
1197   emit_byte(0xC0 | encode);
1198 }
1199 
1200 void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
1201   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1202   emit_byte(0xF2);
1203   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1204   emit_byte(0x0F);
1205   emit_byte(0x2A);
1206   emit_byte(0xC0 | encode);
1207 }
1208 
1209 void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
1210   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1211   emit_byte(0xF3);
1212   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1213   emit_byte(0x0F);
1214   emit_byte(0x2A);
1215   emit_byte(0xC0 | encode);
1216 }
1217 
1218 void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
1219   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1220   emit_byte(0xF3);
1221   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1222   emit_byte(0x0F);
1223   emit_byte(0x5A);
1224   emit_byte(0xC0 | encode);
1225 }
1226 
1227 void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
1228   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1229   emit_byte(0xF2);
1230   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1231   emit_byte(0x0F);
1232   emit_byte(0x2C);
1233   emit_byte(0xC0 | encode);
1234 }
1235 
1236 void Assembler::cvttss2sil(Register dst, XMMRegister src) {
1237   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1238   emit_byte(0xF3);
1239   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1240   emit_byte(0x0F);
1241   emit_byte(0x2C);
1242   emit_byte(0xC0 | encode);
1243 }
1244 
1245 void Assembler::decl(Address dst) {
1246   // Don't use it directly. Use MacroAssembler::decrement() instead.
1247   InstructionMark im(this);
1248   prefix(dst);
1249   emit_byte(0xFF);
1250   emit_operand(rcx, dst);
1251 }
1252 
1253 void Assembler::divsd(XMMRegister dst, Address src) {
1254   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1255   InstructionMark im(this);
1256   emit_byte(0xF2);
1257   prefix(src, dst);
1258   emit_byte(0x0F);
1259   emit_byte(0x5E);
1260   emit_operand(dst, src);
1261 }
1262 
1263 void Assembler::divsd(XMMRegister dst, XMMRegister src) {
1264   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1265   emit_byte(0xF2);
1266   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1267   emit_byte(0x0F);
1268   emit_byte(0x5E);
1269   emit_byte(0xC0 | encode);
1270 }
1271 
1272 void Assembler::divss(XMMRegister dst, Address src) {
1273   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1274   InstructionMark im(this);
1275   emit_byte(0xF3);
1276   prefix(src, dst);
1277   emit_byte(0x0F);
1278   emit_byte(0x5E);
1279   emit_operand(dst, src);
1280 }
1281 
1282 void Assembler::divss(XMMRegister dst, XMMRegister src) {
1283   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1284   emit_byte(0xF3);
1285   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1286   emit_byte(0x0F);
1287   emit_byte(0x5E);
1288   emit_byte(0xC0 | encode);
1289 }
1290 
1291 void Assembler::emms() {
1292   NOT_LP64(assert(VM_Version::supports_mmx(), ""));
1293   emit_byte(0x0F);
1294   emit_byte(0x77);
1295 }
1296 
1297 void Assembler::hlt() {
1298   emit_byte(0xF4);
1299 }
1300 
1301 void Assembler::idivl(Register src) {
1302   int encode = prefix_and_encode(src->encoding());
1303   emit_byte(0xF7);
1304   emit_byte(0xF8 | encode);
1305 }
1306 
1307 void Assembler::divl(Register src) { // Unsigned
1308   int encode = prefix_and_encode(src->encoding());
1309   emit_byte(0xF7);
1310   emit_byte(0xF0 | encode);
1311 }
1312 
1313 void Assembler::imull(Register dst, Register src) {
1314   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1315   emit_byte(0x0F);
1316   emit_byte(0xAF);
1317   emit_byte(0xC0 | encode);
1318 }
1319 
1320 
1321 void Assembler::imull(Register dst, Register src, int value) {
1322   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1323   if (is8bit(value)) {
1324     emit_byte(0x6B);
1325     emit_byte(0xC0 | encode);
1326     emit_byte(value & 0xFF);
1327   } else {
1328     emit_byte(0x69);
1329     emit_byte(0xC0 | encode);
1330     emit_long(value);
1331   }
1332 }
1333 
1334 void Assembler::incl(Address dst) {
1335   // Don't use it directly. Use MacroAssembler::increment() instead.
1336   InstructionMark im(this);
1337   prefix(dst);
1338   emit_byte(0xFF);
1339   emit_operand(rax, dst);
1340 }
1341 
1342 void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
1343   InstructionMark im(this);
1344   assert((0 <= cc) && (cc < 16), "illegal cc");
1345   if (L.is_bound()) {
1346     address dst = target(L);
1347     assert(dst != NULL, "jcc most probably wrong");
1348 
1349     const int short_size = 2;
1350     const int long_size = 6;
1351     intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
1352     if (maybe_short && is8bit(offs - short_size)) {
1353       // 0111 tttn #8-bit disp
1354       emit_byte(0x70 | cc);
1355       emit_byte((offs - short_size) & 0xFF);
1356     } else {
1357       // 0000 1111 1000 tttn #32-bit disp
1358       assert(is_simm32(offs - long_size),
1359              "must be 32bit offset (call4)");
1360       emit_byte(0x0F);
1361       emit_byte(0x80 | cc);
1362       emit_long(offs - long_size);
1363     }
1364   } else {
1365     // Note: could eliminate cond. jumps to this jump if condition
1366     //       is the same however, seems to be rather unlikely case.
1367     // Note: use jccb() if label to be bound is very close to get
1368     //       an 8-bit displacement
1369     L.add_patch_at(code(), locator());
1370     emit_byte(0x0F);
1371     emit_byte(0x80 | cc);
1372     emit_long(0);
1373   }
1374 }
1375 
1376 void Assembler::jccb(Condition cc, Label& L) {
1377   if (L.is_bound()) {
1378     const int short_size = 2;
1379     address entry = target(L);
1380     assert(is8bit((intptr_t)entry - ((intptr_t)_code_pos + short_size)),
1381            "Dispacement too large for a short jmp");
1382     intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
1383     // 0111 tttn #8-bit disp
1384     emit_byte(0x70 | cc);
1385     emit_byte((offs - short_size) & 0xFF);
1386   } else {
1387     InstructionMark im(this);
1388     L.add_patch_at(code(), locator());
1389     emit_byte(0x70 | cc);
1390     emit_byte(0);
1391   }
1392 }
1393 
1394 void Assembler::jmp(Address adr) {
1395   InstructionMark im(this);
1396   prefix(adr);
1397   emit_byte(0xFF);
1398   emit_operand(rsp, adr);
1399 }
1400 
1401 void Assembler::jmp(Label& L, bool maybe_short) {
1402   if (L.is_bound()) {
1403     address entry = target(L);
1404     assert(entry != NULL, "jmp most probably wrong");
1405     InstructionMark im(this);
1406     const int short_size = 2;
1407     const int long_size = 5;
1408     intptr_t offs = entry - _code_pos;
1409     if (maybe_short && is8bit(offs - short_size)) {
1410       emit_byte(0xEB);
1411       emit_byte((offs - short_size) & 0xFF);
1412     } else {
1413       emit_byte(0xE9);
1414       emit_long(offs - long_size);
1415     }
1416   } else {
1417     // By default, forward jumps are always 32-bit displacements, since
1418     // we can't yet know where the label will be bound.  If you're sure that
1419     // the forward jump will not run beyond 256 bytes, use jmpb to
1420     // force an 8-bit displacement.
1421     InstructionMark im(this);
1422     L.add_patch_at(code(), locator());
1423     emit_byte(0xE9);
1424     emit_long(0);
1425   }
1426 }
1427 
1428 void Assembler::jmp(Register entry) {
1429   int encode = prefix_and_encode(entry->encoding());
1430   emit_byte(0xFF);
1431   emit_byte(0xE0 | encode);
1432 }
1433 
1434 void Assembler::jmp_literal(address dest, RelocationHolder const& rspec) {
1435   InstructionMark im(this);
1436   emit_byte(0xE9);
1437   assert(dest != NULL, "must have a target");
1438   intptr_t disp = dest - (_code_pos + sizeof(int32_t));
1439   assert(is_simm32(disp), "must be 32bit offset (jmp)");
1440   emit_data(disp, rspec.reloc(), call32_operand);
1441 }
1442 
1443 void Assembler::jmpb(Label& L) {
1444   if (L.is_bound()) {
1445     const int short_size = 2;
1446     address entry = target(L);
1447     assert(is8bit((entry - _code_pos) + short_size),
1448            "Dispacement too large for a short jmp");
1449     assert(entry != NULL, "jmp most probably wrong");
1450     intptr_t offs = entry - _code_pos;
1451     emit_byte(0xEB);
1452     emit_byte((offs - short_size) & 0xFF);
1453   } else {
1454     InstructionMark im(this);
1455     L.add_patch_at(code(), locator());
1456     emit_byte(0xEB);
1457     emit_byte(0);
1458   }
1459 }
1460 
1461 void Assembler::ldmxcsr( Address src) {
1462   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1463   InstructionMark im(this);
1464   prefix(src);
1465   emit_byte(0x0F);
1466   emit_byte(0xAE);
1467   emit_operand(as_Register(2), src);
1468 }
1469 
1470 void Assembler::leal(Register dst, Address src) {
1471   InstructionMark im(this);
1472 #ifdef _LP64
1473   emit_byte(0x67); // addr32
1474   prefix(src, dst);
1475 #endif // LP64
1476   emit_byte(0x8D);
1477   emit_operand(dst, src);
1478 }
1479 
1480 void Assembler::lock() {
1481   if (Atomics & 1) {
1482      // Emit either nothing, a NOP, or a NOP: prefix
1483      emit_byte(0x90) ;
1484   } else {
1485      emit_byte(0xF0);
1486   }
1487 }
1488 
1489 void Assembler::lzcntl(Register dst, Register src) {
1490   assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
1491   emit_byte(0xF3);
1492   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1493   emit_byte(0x0F);
1494   emit_byte(0xBD);
1495   emit_byte(0xC0 | encode);
1496 }
1497 
1498 // Emit mfence instruction
1499 void Assembler::mfence() {
1500   NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
1501   emit_byte( 0x0F );
1502   emit_byte( 0xAE );
1503   emit_byte( 0xF0 );
1504 }
1505 
1506 void Assembler::mov(Register dst, Register src) {
1507   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
1508 }
1509 
1510 void Assembler::movapd(XMMRegister dst, XMMRegister src) {
1511   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1512   int dstenc = dst->encoding();
1513   int srcenc = src->encoding();
1514   emit_byte(0x66);
1515   if (dstenc < 8) {
1516     if (srcenc >= 8) {
1517       prefix(REX_B);
1518       srcenc -= 8;
1519     }
1520   } else {
1521     if (srcenc < 8) {
1522       prefix(REX_R);
1523     } else {
1524       prefix(REX_RB);
1525       srcenc -= 8;
1526     }
1527     dstenc -= 8;
1528   }
1529   emit_byte(0x0F);
1530   emit_byte(0x28);
1531   emit_byte(0xC0 | dstenc << 3 | srcenc);
1532 }
1533 
1534 void Assembler::movaps(XMMRegister dst, XMMRegister src) {
1535   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1536   int dstenc = dst->encoding();
1537   int srcenc = src->encoding();
1538   if (dstenc < 8) {
1539     if (srcenc >= 8) {
1540       prefix(REX_B);
1541       srcenc -= 8;
1542     }
1543   } else {
1544     if (srcenc < 8) {
1545       prefix(REX_R);
1546     } else {
1547       prefix(REX_RB);
1548       srcenc -= 8;
1549     }
1550     dstenc -= 8;
1551   }
1552   emit_byte(0x0F);
1553   emit_byte(0x28);
1554   emit_byte(0xC0 | dstenc << 3 | srcenc);
1555 }
1556 
1557 void Assembler::movb(Register dst, Address src) {
1558   NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
1559   InstructionMark im(this);
1560   prefix(src, dst, true);
1561   emit_byte(0x8A);
1562   emit_operand(dst, src);
1563 }
1564 
1565 
1566 void Assembler::movb(Address dst, int imm8) {
1567   InstructionMark im(this);
1568    prefix(dst);
1569   emit_byte(0xC6);
1570   emit_operand(rax, dst, 1);
1571   emit_byte(imm8);
1572 }
1573 
1574 
1575 void Assembler::movb(Address dst, Register src) {
1576   assert(src->has_byte_register(), "must have byte register");
1577   InstructionMark im(this);
1578   prefix(dst, src, true);
1579   emit_byte(0x88);
1580   emit_operand(src, dst);
1581 }
1582 
1583 void Assembler::movdl(XMMRegister dst, Register src) {
1584   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1585   emit_byte(0x66);
1586   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1587   emit_byte(0x0F);
1588   emit_byte(0x6E);
1589   emit_byte(0xC0 | encode);
1590 }
1591 
1592 void Assembler::movdl(Register dst, XMMRegister src) {
1593   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1594   emit_byte(0x66);
1595   // swap src/dst to get correct prefix
1596   int encode = prefix_and_encode(src->encoding(), dst->encoding());
1597   emit_byte(0x0F);
1598   emit_byte(0x7E);
1599   emit_byte(0xC0 | encode);
1600 }
1601 
1602 void Assembler::movdl(XMMRegister dst, Address src) {
1603   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1604   InstructionMark im(this);
1605   emit_byte(0x66);
1606   prefix(src, dst);
1607   emit_byte(0x0F);
1608   emit_byte(0x6E);
1609   emit_operand(dst, src);
1610 }
1611 
1612 
1613 void Assembler::movdqa(XMMRegister dst, Address src) {
1614   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1615   InstructionMark im(this);
1616   emit_byte(0x66);
1617   prefix(src, dst);
1618   emit_byte(0x0F);
1619   emit_byte(0x6F);
1620   emit_operand(dst, src);
1621 }
1622 
1623 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
1624   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1625   emit_byte(0x66);
1626   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
1627   emit_byte(0x0F);
1628   emit_byte(0x6F);
1629   emit_byte(0xC0 | encode);
1630 }
1631 
1632 void Assembler::movdqa(Address dst, XMMRegister src) {
1633   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1634   InstructionMark im(this);
1635   emit_byte(0x66);
1636   prefix(dst, src);
1637   emit_byte(0x0F);
1638   emit_byte(0x7F);
1639   emit_operand(src, dst);
1640 }
1641 
1642 void Assembler::movdqu(XMMRegister dst, Address src) {
1643   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1644   InstructionMark im(this);
1645   emit_byte(0xF3);
1646   prefix(src, dst);
1647   emit_byte(0x0F);
1648   emit_byte(0x6F);
1649   emit_operand(dst, src);
1650 }
1651 
1652 void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
1653   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1654   emit_byte(0xF3);
1655   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
1656   emit_byte(0x0F);
1657   emit_byte(0x6F);
1658   emit_byte(0xC0 | encode);
1659 }
1660 
1661 void Assembler::movdqu(Address dst, XMMRegister src) {
1662   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1663   InstructionMark im(this);
1664   emit_byte(0xF3);
1665   prefix(dst, src);
1666   emit_byte(0x0F);
1667   emit_byte(0x7F);
1668   emit_operand(src, dst);
1669 }
1670 
1671 // Uses zero extension on 64bit
1672 
1673 void Assembler::movl(Register dst, int32_t imm32) {
1674   int encode = prefix_and_encode(dst->encoding());
1675   emit_byte(0xB8 | encode);
1676   emit_long(imm32);
1677 }
1678 
1679 void Assembler::movl(Register dst, Register src) {
1680   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1681   emit_byte(0x8B);
1682   emit_byte(0xC0 | encode);
1683 }
1684 
1685 void Assembler::movl(Register dst, Address src) {
1686   InstructionMark im(this);
1687   prefix(src, dst);
1688   emit_byte(0x8B);
1689   emit_operand(dst, src);
1690 }
1691 
1692 void Assembler::movl(Address dst, int32_t imm32) {
1693   InstructionMark im(this);
1694   prefix(dst);
1695   emit_byte(0xC7);
1696   emit_operand(rax, dst, 4);
1697   emit_long(imm32);
1698 }
1699 
1700 void Assembler::movl(Address dst, Register src) {
1701   InstructionMark im(this);
1702   prefix(dst, src);
1703   emit_byte(0x89);
1704   emit_operand(src, dst);
1705 }
1706 
1707 // New cpus require to use movsd and movss to avoid partial register stall
1708 // when loading from memory. But for old Opteron use movlpd instead of movsd.
1709 // The selection is done in MacroAssembler::movdbl() and movflt().
1710 void Assembler::movlpd(XMMRegister dst, Address src) {
1711   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1712   InstructionMark im(this);
1713   emit_byte(0x66);
1714   prefix(src, dst);
1715   emit_byte(0x0F);
1716   emit_byte(0x12);
1717   emit_operand(dst, src);
1718 }
1719 
1720 void Assembler::movq( MMXRegister dst, Address src ) {
1721   assert( VM_Version::supports_mmx(), "" );
1722   emit_byte(0x0F);
1723   emit_byte(0x6F);
1724   emit_operand(dst, src);
1725 }
1726 
1727 void Assembler::movq( Address dst, MMXRegister src ) {
1728   assert( VM_Version::supports_mmx(), "" );
1729   emit_byte(0x0F);
1730   emit_byte(0x7F);
1731   // workaround gcc (3.2.1-7a) bug
1732   // In that version of gcc with only an emit_operand(MMX, Address)
1733   // gcc will tail jump and try and reverse the parameters completely
1734   // obliterating dst in the process. By having a version available
1735   // that doesn't need to swap the args at the tail jump the bug is
1736   // avoided.
1737   emit_operand(dst, src);
1738 }
1739 
1740 void Assembler::movq(XMMRegister dst, Address src) {
1741   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1742   InstructionMark im(this);
1743   emit_byte(0xF3);
1744   prefix(src, dst);
1745   emit_byte(0x0F);
1746   emit_byte(0x7E);
1747   emit_operand(dst, src);
1748 }
1749 
1750 void Assembler::movq(Address dst, XMMRegister src) {
1751   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1752   InstructionMark im(this);
1753   emit_byte(0x66);
1754   prefix(dst, src);
1755   emit_byte(0x0F);
1756   emit_byte(0xD6);
1757   emit_operand(src, dst);
1758 }
1759 
1760 void Assembler::movsbl(Register dst, Address src) { // movsxb
1761   InstructionMark im(this);
1762   prefix(src, dst);
1763   emit_byte(0x0F);
1764   emit_byte(0xBE);
1765   emit_operand(dst, src);
1766 }
1767 
1768 void Assembler::movsbl(Register dst, Register src) { // movsxb
1769   NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
1770   int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
1771   emit_byte(0x0F);
1772   emit_byte(0xBE);
1773   emit_byte(0xC0 | encode);
1774 }
1775 
1776 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
1777   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1778   emit_byte(0xF2);
1779   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1780   emit_byte(0x0F);
1781   emit_byte(0x10);
1782   emit_byte(0xC0 | encode);
1783 }
1784 
1785 void Assembler::movsd(XMMRegister dst, Address src) {
1786   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1787   InstructionMark im(this);
1788   emit_byte(0xF2);
1789   prefix(src, dst);
1790   emit_byte(0x0F);
1791   emit_byte(0x10);
1792   emit_operand(dst, src);
1793 }
1794 
1795 void Assembler::movsd(Address dst, XMMRegister src) {
1796   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1797   InstructionMark im(this);
1798   emit_byte(0xF2);
1799   prefix(dst, src);
1800   emit_byte(0x0F);
1801   emit_byte(0x11);
1802   emit_operand(src, dst);
1803 }
1804 
1805 void Assembler::movss(XMMRegister dst, XMMRegister src) {
1806   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1807   emit_byte(0xF3);
1808   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1809   emit_byte(0x0F);
1810   emit_byte(0x10);
1811   emit_byte(0xC0 | encode);
1812 }
1813 
1814 void Assembler::movss(XMMRegister dst, Address src) {
1815   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1816   InstructionMark im(this);
1817   emit_byte(0xF3);
1818   prefix(src, dst);
1819   emit_byte(0x0F);
1820   emit_byte(0x10);
1821   emit_operand(dst, src);
1822 }
1823 
1824 void Assembler::movss(Address dst, XMMRegister src) {
1825   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1826   InstructionMark im(this);
1827   emit_byte(0xF3);
1828   prefix(dst, src);
1829   emit_byte(0x0F);
1830   emit_byte(0x11);
1831   emit_operand(src, dst);
1832 }
1833 
1834 void Assembler::movswl(Register dst, Address src) { // movsxw
1835   InstructionMark im(this);
1836   prefix(src, dst);
1837   emit_byte(0x0F);
1838   emit_byte(0xBF);
1839   emit_operand(dst, src);
1840 }
1841 
1842 void Assembler::movswl(Register dst, Register src) { // movsxw
1843   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1844   emit_byte(0x0F);
1845   emit_byte(0xBF);
1846   emit_byte(0xC0 | encode);
1847 }
1848 
1849 void Assembler::movw(Address dst, int imm16) {
1850   InstructionMark im(this);
1851 
1852   emit_byte(0x66); // switch to 16-bit mode
1853   prefix(dst);
1854   emit_byte(0xC7);
1855   emit_operand(rax, dst, 2);
1856   emit_word(imm16);
1857 }
1858 
1859 void Assembler::movw(Register dst, Address src) {
1860   InstructionMark im(this);
1861   emit_byte(0x66);
1862   prefix(src, dst);
1863   emit_byte(0x8B);
1864   emit_operand(dst, src);
1865 }
1866 
1867 void Assembler::movw(Address dst, Register src) {
1868   InstructionMark im(this);
1869   emit_byte(0x66);
1870   prefix(dst, src);
1871   emit_byte(0x89);
1872   emit_operand(src, dst);
1873 }
1874 
1875 void Assembler::movzbl(Register dst, Address src) { // movzxb
1876   InstructionMark im(this);
1877   prefix(src, dst);
1878   emit_byte(0x0F);
1879   emit_byte(0xB6);
1880   emit_operand(dst, src);
1881 }
1882 
1883 void Assembler::movzbl(Register dst, Register src) { // movzxb
1884   NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
1885   int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
1886   emit_byte(0x0F);
1887   emit_byte(0xB6);
1888   emit_byte(0xC0 | encode);
1889 }
1890 
1891 void Assembler::movzwl(Register dst, Address src) { // movzxw
1892   InstructionMark im(this);
1893   prefix(src, dst);
1894   emit_byte(0x0F);
1895   emit_byte(0xB7);
1896   emit_operand(dst, src);
1897 }
1898 
1899 void Assembler::movzwl(Register dst, Register src) { // movzxw
1900   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1901   emit_byte(0x0F);
1902   emit_byte(0xB7);
1903   emit_byte(0xC0 | encode);
1904 }
1905 
1906 void Assembler::mull(Address src) {
1907   InstructionMark im(this);
1908   prefix(src);
1909   emit_byte(0xF7);
1910   emit_operand(rsp, src);
1911 }
1912 
1913 void Assembler::mull(Register src) {
1914   int encode = prefix_and_encode(src->encoding());
1915   emit_byte(0xF7);
1916   emit_byte(0xE0 | encode);
1917 }
1918 
1919 void Assembler::mulsd(XMMRegister dst, Address src) {
1920   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1921   InstructionMark im(this);
1922   emit_byte(0xF2);
1923   prefix(src, dst);
1924   emit_byte(0x0F);
1925   emit_byte(0x59);
1926   emit_operand(dst, src);
1927 }
1928 
1929 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
1930   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
1931   emit_byte(0xF2);
1932   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1933   emit_byte(0x0F);
1934   emit_byte(0x59);
1935   emit_byte(0xC0 | encode);
1936 }
1937 
1938 void Assembler::mulss(XMMRegister dst, Address src) {
1939   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1940   InstructionMark im(this);
1941   emit_byte(0xF3);
1942   prefix(src, dst);
1943   emit_byte(0x0F);
1944   emit_byte(0x59);
1945   emit_operand(dst, src);
1946 }
1947 
1948 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
1949   NOT_LP64(assert(VM_Version::supports_sse(), ""));
1950   emit_byte(0xF3);
1951   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1952   emit_byte(0x0F);
1953   emit_byte(0x59);
1954   emit_byte(0xC0 | encode);
1955 }
1956 
1957 void Assembler::negl(Register dst) {
1958   int encode = prefix_and_encode(dst->encoding());
1959   emit_byte(0xF7);
1960   emit_byte(0xD8 | encode);
1961 }
1962 
1963 void Assembler::nop(int i) {
1964 #ifdef ASSERT
1965   assert(i > 0, " ");
1966   // The fancy nops aren't currently recognized by debuggers making it a
1967   // pain to disassemble code while debugging. If asserts are on clearly
1968   // speed is not an issue so simply use the single byte traditional nop
1969   // to do alignment.
1970 
1971   for (; i > 0 ; i--) emit_byte(0x90);
1972   return;
1973 
1974 #endif // ASSERT
1975 
1976   if (UseAddressNop && VM_Version::is_intel()) {
1977     //
1978     // Using multi-bytes nops "0x0F 0x1F [address]" for Intel
1979     //  1: 0x90
1980     //  2: 0x66 0x90
1981     //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
1982     //  4: 0x0F 0x1F 0x40 0x00
1983     //  5: 0x0F 0x1F 0x44 0x00 0x00
1984     //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
1985     //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
1986     //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1987     //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1988     // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1989     // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
1990 
1991     // The rest coding is Intel specific - don't use consecutive address nops
1992 
1993     // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1994     // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1995     // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1996     // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
1997 
1998     while(i >= 15) {
1999       // For Intel don't generate consecutive addess nops (mix with regular nops)
2000       i -= 15;
2001       emit_byte(0x66);   // size prefix
2002       emit_byte(0x66);   // size prefix
2003       emit_byte(0x66);   // size prefix
2004       addr_nop_8();
2005       emit_byte(0x66);   // size prefix
2006       emit_byte(0x66);   // size prefix
2007       emit_byte(0x66);   // size prefix
2008       emit_byte(0x90);   // nop
2009     }
2010     switch (i) {
2011       case 14:
2012         emit_byte(0x66); // size prefix
2013       case 13:
2014         emit_byte(0x66); // size prefix
2015       case 12:
2016         addr_nop_8();
2017         emit_byte(0x66); // size prefix
2018         emit_byte(0x66); // size prefix
2019         emit_byte(0x66); // size prefix
2020         emit_byte(0x90); // nop
2021         break;
2022       case 11:
2023         emit_byte(0x66); // size prefix
2024       case 10:
2025         emit_byte(0x66); // size prefix
2026       case 9:
2027         emit_byte(0x66); // size prefix
2028       case 8:
2029         addr_nop_8();
2030         break;
2031       case 7:
2032         addr_nop_7();
2033         break;
2034       case 6:
2035         emit_byte(0x66); // size prefix
2036       case 5:
2037         addr_nop_5();
2038         break;
2039       case 4:
2040         addr_nop_4();
2041         break;
2042       case 3:
2043         // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2044         emit_byte(0x66); // size prefix
2045       case 2:
2046         emit_byte(0x66); // size prefix
2047       case 1:
2048         emit_byte(0x90); // nop
2049         break;
2050       default:
2051         assert(i == 0, " ");
2052     }
2053     return;
2054   }
2055   if (UseAddressNop && VM_Version::is_amd()) {
2056     //
2057     // Using multi-bytes nops "0x0F 0x1F [address]" for AMD.
2058     //  1: 0x90
2059     //  2: 0x66 0x90
2060     //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
2061     //  4: 0x0F 0x1F 0x40 0x00
2062     //  5: 0x0F 0x1F 0x44 0x00 0x00
2063     //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
2064     //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2065     //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2066     //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2067     // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2068     // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2069 
2070     // The rest coding is AMD specific - use consecutive address nops
2071 
2072     // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2073     // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
2074     // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2075     // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
2076     // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
2077     //     Size prefixes (0x66) are added for larger sizes
2078 
2079     while(i >= 22) {
2080       i -= 11;
2081       emit_byte(0x66); // size prefix
2082       emit_byte(0x66); // size prefix
2083       emit_byte(0x66); // size prefix
2084       addr_nop_8();
2085     }
2086     // Generate first nop for size between 21-12
2087     switch (i) {
2088       case 21:
2089         i -= 1;
2090         emit_byte(0x66); // size prefix
2091       case 20:
2092       case 19:
2093         i -= 1;
2094         emit_byte(0x66); // size prefix
2095       case 18:
2096       case 17:
2097         i -= 1;
2098         emit_byte(0x66); // size prefix
2099       case 16:
2100       case 15:
2101         i -= 8;
2102         addr_nop_8();
2103         break;
2104       case 14:
2105       case 13:
2106         i -= 7;
2107         addr_nop_7();
2108         break;
2109       case 12:
2110         i -= 6;
2111         emit_byte(0x66); // size prefix
2112         addr_nop_5();
2113         break;
2114       default:
2115         assert(i < 12, " ");
2116     }
2117 
2118     // Generate second nop for size between 11-1
2119     switch (i) {
2120       case 11:
2121         emit_byte(0x66); // size prefix
2122       case 10:
2123         emit_byte(0x66); // size prefix
2124       case 9:
2125         emit_byte(0x66); // size prefix
2126       case 8:
2127         addr_nop_8();
2128         break;
2129       case 7:
2130         addr_nop_7();
2131         break;
2132       case 6:
2133         emit_byte(0x66); // size prefix
2134       case 5:
2135         addr_nop_5();
2136         break;
2137       case 4:
2138         addr_nop_4();
2139         break;
2140       case 3:
2141         // Don't use "0x0F 0x1F 0x00" - need patching safe padding
2142         emit_byte(0x66); // size prefix
2143       case 2:
2144         emit_byte(0x66); // size prefix
2145       case 1:
2146         emit_byte(0x90); // nop
2147         break;
2148       default:
2149         assert(i == 0, " ");
2150     }
2151     return;
2152   }
2153 
2154   // Using nops with size prefixes "0x66 0x90".
2155   // From AMD Optimization Guide:
2156   //  1: 0x90
2157   //  2: 0x66 0x90
2158   //  3: 0x66 0x66 0x90
2159   //  4: 0x66 0x66 0x66 0x90
2160   //  5: 0x66 0x66 0x90 0x66 0x90
2161   //  6: 0x66 0x66 0x90 0x66 0x66 0x90
2162   //  7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
2163   //  8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
2164   //  9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2165   // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
2166   //
2167   while(i > 12) {
2168     i -= 4;
2169     emit_byte(0x66); // size prefix
2170     emit_byte(0x66);
2171     emit_byte(0x66);
2172     emit_byte(0x90); // nop
2173   }
2174   // 1 - 12 nops
2175   if(i > 8) {
2176     if(i > 9) {
2177       i -= 1;
2178       emit_byte(0x66);
2179     }
2180     i -= 3;
2181     emit_byte(0x66);
2182     emit_byte(0x66);
2183     emit_byte(0x90);
2184   }
2185   // 1 - 8 nops
2186   if(i > 4) {
2187     if(i > 6) {
2188       i -= 1;
2189       emit_byte(0x66);
2190     }
2191     i -= 3;
2192     emit_byte(0x66);
2193     emit_byte(0x66);
2194     emit_byte(0x90);
2195   }
2196   switch (i) {
2197     case 4:
2198       emit_byte(0x66);
2199     case 3:
2200       emit_byte(0x66);
2201     case 2:
2202       emit_byte(0x66);
2203     case 1:
2204       emit_byte(0x90);
2205       break;
2206     default:
2207       assert(i == 0, " ");
2208   }
2209 }
2210 
2211 void Assembler::notl(Register dst) {
2212   int encode = prefix_and_encode(dst->encoding());
2213   emit_byte(0xF7);
2214   emit_byte(0xD0 | encode );
2215 }
2216 
2217 void Assembler::orl(Address dst, int32_t imm32) {
2218   InstructionMark im(this);
2219   prefix(dst);
2220   emit_arith_operand(0x81, rcx, dst, imm32);
2221 }
2222 
2223 void Assembler::orl(Register dst, int32_t imm32) {
2224   prefix(dst);
2225   emit_arith(0x81, 0xC8, dst, imm32);
2226 }
2227 
2228 void Assembler::orl(Register dst, Address src) {
2229   InstructionMark im(this);
2230   prefix(src, dst);
2231   emit_byte(0x0B);
2232   emit_operand(dst, src);
2233 }
2234 
2235 void Assembler::orl(Register dst, Register src) {
2236   (void) prefix_and_encode(dst->encoding(), src->encoding());
2237   emit_arith(0x0B, 0xC0, dst, src);
2238 }
2239 
2240 void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
2241   assert(VM_Version::supports_sse4_2(), "");
2242 
2243   InstructionMark im(this);
2244   emit_byte(0x66);
2245   prefix(src, dst);
2246   emit_byte(0x0F);
2247   emit_byte(0x3A);
2248   emit_byte(0x61);
2249   emit_operand(dst, src);
2250   emit_byte(imm8);
2251 }
2252 
2253 void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
2254   assert(VM_Version::supports_sse4_2(), "");
2255 
2256   emit_byte(0x66);
2257   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
2258   emit_byte(0x0F);
2259   emit_byte(0x3A);
2260   emit_byte(0x61);
2261   emit_byte(0xC0 | encode);
2262   emit_byte(imm8);
2263 }
2264 
2265 // generic
2266 void Assembler::pop(Register dst) {
2267   int encode = prefix_and_encode(dst->encoding());
2268   emit_byte(0x58 | encode);
2269 }
2270 
2271 void Assembler::popcntl(Register dst, Address src) {
2272   assert(VM_Version::supports_popcnt(), "must support");
2273   InstructionMark im(this);
2274   emit_byte(0xF3);
2275   prefix(src, dst);
2276   emit_byte(0x0F);
2277   emit_byte(0xB8);
2278   emit_operand(dst, src);
2279 }
2280 
2281 void Assembler::popcntl(Register dst, Register src) {
2282   assert(VM_Version::supports_popcnt(), "must support");
2283   emit_byte(0xF3);
2284   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2285   emit_byte(0x0F);
2286   emit_byte(0xB8);
2287   emit_byte(0xC0 | encode);
2288 }
2289 
2290 void Assembler::popf() {
2291   emit_byte(0x9D);
2292 }
2293 
2294 #ifndef _LP64 // no 32bit push/pop on amd64
2295 void Assembler::popl(Address dst) {
2296   // NOTE: this will adjust stack by 8byte on 64bits
2297   InstructionMark im(this);
2298   prefix(dst);
2299   emit_byte(0x8F);
2300   emit_operand(rax, dst);
2301 }
2302 #endif
2303 
2304 void Assembler::prefetch_prefix(Address src) {
2305   prefix(src);
2306   emit_byte(0x0F);
2307 }
2308 
2309 void Assembler::prefetchnta(Address src) {
2310   NOT_LP64(assert(VM_Version::supports_sse2(), "must support"));
2311   InstructionMark im(this);
2312   prefetch_prefix(src);
2313   emit_byte(0x18);
2314   emit_operand(rax, src); // 0, src
2315 }
2316 
2317 void Assembler::prefetchr(Address src) {
2318   assert(VM_Version::supports_3dnow_prefetch(), "must support");
2319   InstructionMark im(this);
2320   prefetch_prefix(src);
2321   emit_byte(0x0D);
2322   emit_operand(rax, src); // 0, src
2323 }
2324 
2325 void Assembler::prefetcht0(Address src) {
2326   NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
2327   InstructionMark im(this);
2328   prefetch_prefix(src);
2329   emit_byte(0x18);
2330   emit_operand(rcx, src); // 1, src
2331 }
2332 
2333 void Assembler::prefetcht1(Address src) {
2334   NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
2335   InstructionMark im(this);
2336   prefetch_prefix(src);
2337   emit_byte(0x18);
2338   emit_operand(rdx, src); // 2, src
2339 }
2340 
2341 void Assembler::prefetcht2(Address src) {
2342   NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
2343   InstructionMark im(this);
2344   prefetch_prefix(src);
2345   emit_byte(0x18);
2346   emit_operand(rbx, src); // 3, src
2347 }
2348 
2349 void Assembler::prefetchw(Address src) {
2350   assert(VM_Version::supports_3dnow_prefetch(), "must support");
2351   InstructionMark im(this);
2352   prefetch_prefix(src);
2353   emit_byte(0x0D);
2354   emit_operand(rcx, src); // 1, src
2355 }
2356 
2357 void Assembler::prefix(Prefix p) {
2358   a_byte(p);
2359 }
2360 
2361 void Assembler::por(XMMRegister dst, XMMRegister src) {
2362   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2363 
2364   emit_byte(0x66);
2365   int  encode = prefix_and_encode(dst->encoding(), src->encoding());
2366   emit_byte(0x0F);
2367 
2368   emit_byte(0xEB);
2369   emit_byte(0xC0 | encode);
2370 }
2371 
2372 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
2373   assert(isByte(mode), "invalid value");
2374   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2375 
2376   emit_byte(0x66);
2377   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2378   emit_byte(0x0F);
2379   emit_byte(0x70);
2380   emit_byte(0xC0 | encode);
2381   emit_byte(mode & 0xFF);
2382 
2383 }
2384 
2385 void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
2386   assert(isByte(mode), "invalid value");
2387   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2388 
2389   InstructionMark im(this);
2390   emit_byte(0x66);
2391   prefix(src, dst);
2392   emit_byte(0x0F);
2393   emit_byte(0x70);
2394   emit_operand(dst, src);
2395   emit_byte(mode & 0xFF);
2396 }
2397 
2398 void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
2399   assert(isByte(mode), "invalid value");
2400   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2401 
2402   emit_byte(0xF2);
2403   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2404   emit_byte(0x0F);
2405   emit_byte(0x70);
2406   emit_byte(0xC0 | encode);
2407   emit_byte(mode & 0xFF);
2408 }
2409 
2410 void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
2411   assert(isByte(mode), "invalid value");
2412   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2413 
2414   InstructionMark im(this);
2415   emit_byte(0xF2);
2416   prefix(src, dst); // QQ new
2417   emit_byte(0x0F);
2418   emit_byte(0x70);
2419   emit_operand(dst, src);
2420   emit_byte(mode & 0xFF);
2421 }
2422 
2423 void Assembler::psrlq(XMMRegister dst, int shift) {
2424   // Shift 64 bit value logically right by specified number of bits.
2425   // HMM Table D-1 says sse2 or mmx.
2426   // Do not confuse it with psrldq SSE2 instruction which
2427   // shifts 128 bit value in xmm register by number of bytes.
2428   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2429 
2430   int encode = prefixq_and_encode(xmm2->encoding(), dst->encoding());
2431   emit_byte(0x66);
2432   emit_byte(0x0F);
2433   emit_byte(0x73);
2434   emit_byte(0xC0 | encode);
2435   emit_byte(shift);
2436 }
2437 
2438 void Assembler::psrldq(XMMRegister dst, int shift) {
2439   // Shift 128 bit value in xmm register by number of bytes.
2440   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2441 
2442   int encode = prefixq_and_encode(xmm3->encoding(), dst->encoding());
2443   emit_byte(0x66);
2444   emit_byte(0x0F);
2445   emit_byte(0x73);
2446   emit_byte(0xC0 | encode);
2447   emit_byte(shift);
2448 }
2449 
2450 void Assembler::ptest(XMMRegister dst, Address src) {
2451   assert(VM_Version::supports_sse4_1(), "");
2452 
2453   InstructionMark im(this);
2454   emit_byte(0x66);
2455   prefix(src, dst);
2456   emit_byte(0x0F);
2457   emit_byte(0x38);
2458   emit_byte(0x17);
2459   emit_operand(dst, src);
2460 }
2461 
2462 void Assembler::ptest(XMMRegister dst, XMMRegister src) {
2463   assert(VM_Version::supports_sse4_1(), "");
2464 
2465   emit_byte(0x66);
2466   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
2467   emit_byte(0x0F);
2468   emit_byte(0x38);
2469   emit_byte(0x17);
2470   emit_byte(0xC0 | encode);
2471 }
2472 
2473 void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
2474   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2475   emit_byte(0x66);
2476   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2477   emit_byte(0x0F);
2478   emit_byte(0x60);
2479   emit_byte(0xC0 | encode);
2480 }
2481 
2482 void Assembler::push(int32_t imm32) {
2483   // in 64bits we push 64bits onto the stack but only
2484   // take a 32bit immediate
2485   emit_byte(0x68);
2486   emit_long(imm32);
2487 }
2488 
2489 void Assembler::push(Register src) {
2490   int encode = prefix_and_encode(src->encoding());
2491 
2492   emit_byte(0x50 | encode);
2493 }
2494 
2495 void Assembler::pushf() {
2496   emit_byte(0x9C);
2497 }
2498 
2499 #ifndef _LP64 // no 32bit push/pop on amd64
2500 void Assembler::pushl(Address src) {
2501   // Note this will push 64bit on 64bit
2502   InstructionMark im(this);
2503   prefix(src);
2504   emit_byte(0xFF);
2505   emit_operand(rsi, src);
2506 }
2507 #endif
2508 
2509 void Assembler::pxor(XMMRegister dst, Address src) {
2510   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2511   InstructionMark im(this);
2512   emit_byte(0x66);
2513   prefix(src, dst);
2514   emit_byte(0x0F);
2515   emit_byte(0xEF);
2516   emit_operand(dst, src);
2517 }
2518 
2519 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
2520   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2521   InstructionMark im(this);
2522   emit_byte(0x66);
2523   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2524   emit_byte(0x0F);
2525   emit_byte(0xEF);
2526   emit_byte(0xC0 | encode);
2527 }
2528 
2529 void Assembler::rcll(Register dst, int imm8) {
2530   assert(isShiftCount(imm8), "illegal shift count");
2531   int encode = prefix_and_encode(dst->encoding());
2532   if (imm8 == 1) {
2533     emit_byte(0xD1);
2534     emit_byte(0xD0 | encode);
2535   } else {
2536     emit_byte(0xC1);
2537     emit_byte(0xD0 | encode);
2538     emit_byte(imm8);
2539   }
2540 }
2541 
2542 // copies data from [esi] to [edi] using rcx pointer sized words
2543 // generic
2544 void Assembler::rep_mov() {
2545   emit_byte(0xF3);
2546   // MOVSQ
2547   LP64_ONLY(prefix(REX_W));
2548   emit_byte(0xA5);
2549 }
2550 
2551 // sets rcx pointer sized words with rax, value at [edi]
2552 // generic
2553 void Assembler::rep_set() { // rep_set
2554   emit_byte(0xF3);
2555   // STOSQ
2556   LP64_ONLY(prefix(REX_W));
2557   emit_byte(0xAB);
2558 }
2559 
2560 // scans rcx pointer sized words at [edi] for occurance of rax,
2561 // generic
2562 void Assembler::repne_scan() { // repne_scan
2563   emit_byte(0xF2);
2564   // SCASQ
2565   LP64_ONLY(prefix(REX_W));
2566   emit_byte(0xAF);
2567 }
2568 
2569 #ifdef _LP64
2570 // scans rcx 4 byte words at [edi] for occurance of rax,
2571 // generic
2572 void Assembler::repne_scanl() { // repne_scan
2573   emit_byte(0xF2);
2574   // SCASL
2575   emit_byte(0xAF);
2576 }
2577 #endif
2578 
2579 void Assembler::ret(int imm16) {
2580   if (imm16 == 0) {
2581     emit_byte(0xC3);
2582   } else {
2583     emit_byte(0xC2);
2584     emit_word(imm16);
2585   }
2586 }
2587 
2588 void Assembler::sahf() {
2589 #ifdef _LP64
2590   // Not supported in 64bit mode
2591   ShouldNotReachHere();
2592 #endif
2593   emit_byte(0x9E);
2594 }
2595 
2596 void Assembler::sarl(Register dst, int imm8) {
2597   int encode = prefix_and_encode(dst->encoding());
2598   assert(isShiftCount(imm8), "illegal shift count");
2599   if (imm8 == 1) {
2600     emit_byte(0xD1);
2601     emit_byte(0xF8 | encode);
2602   } else {
2603     emit_byte(0xC1);
2604     emit_byte(0xF8 | encode);
2605     emit_byte(imm8);
2606   }
2607 }
2608 
2609 void Assembler::sarl(Register dst) {
2610   int encode = prefix_and_encode(dst->encoding());
2611   emit_byte(0xD3);
2612   emit_byte(0xF8 | encode);
2613 }
2614 
2615 void Assembler::sbbl(Address dst, int32_t imm32) {
2616   InstructionMark im(this);
2617   prefix(dst);
2618   emit_arith_operand(0x81, rbx, dst, imm32);
2619 }
2620 
2621 void Assembler::sbbl(Register dst, int32_t imm32) {
2622   prefix(dst);
2623   emit_arith(0x81, 0xD8, dst, imm32);
2624 }
2625 
2626 
2627 void Assembler::sbbl(Register dst, Address src) {
2628   InstructionMark im(this);
2629   prefix(src, dst);
2630   emit_byte(0x1B);
2631   emit_operand(dst, src);
2632 }
2633 
2634 void Assembler::sbbl(Register dst, Register src) {
2635   (void) prefix_and_encode(dst->encoding(), src->encoding());
2636   emit_arith(0x1B, 0xC0, dst, src);
2637 }
2638 
2639 void Assembler::setb(Condition cc, Register dst) {
2640   assert(0 <= cc && cc < 16, "illegal cc");
2641   int encode = prefix_and_encode(dst->encoding(), true);
2642   emit_byte(0x0F);
2643   emit_byte(0x90 | cc);
2644   emit_byte(0xC0 | encode);
2645 }
2646 
2647 void Assembler::shll(Register dst, int imm8) {
2648   assert(isShiftCount(imm8), "illegal shift count");
2649   int encode = prefix_and_encode(dst->encoding());
2650   if (imm8 == 1 ) {
2651     emit_byte(0xD1);
2652     emit_byte(0xE0 | encode);
2653   } else {
2654     emit_byte(0xC1);
2655     emit_byte(0xE0 | encode);
2656     emit_byte(imm8);
2657   }
2658 }
2659 
2660 void Assembler::shll(Register dst) {
2661   int encode = prefix_and_encode(dst->encoding());
2662   emit_byte(0xD3);
2663   emit_byte(0xE0 | encode);
2664 }
2665 
2666 void Assembler::shrl(Register dst, int imm8) {
2667   assert(isShiftCount(imm8), "illegal shift count");
2668   int encode = prefix_and_encode(dst->encoding());
2669   emit_byte(0xC1);
2670   emit_byte(0xE8 | encode);
2671   emit_byte(imm8);
2672 }
2673 
2674 void Assembler::shrl(Register dst) {
2675   int encode = prefix_and_encode(dst->encoding());
2676   emit_byte(0xD3);
2677   emit_byte(0xE8 | encode);
2678 }
2679 
2680 // copies a single word from [esi] to [edi]
2681 void Assembler::smovl() {
2682   emit_byte(0xA5);
2683 }
2684 
2685 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
2686   // HMM Table D-1 says sse2
2687   // NOT_LP64(assert(VM_Version::supports_sse(), ""));
2688   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2689   emit_byte(0xF2);
2690   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2691   emit_byte(0x0F);
2692   emit_byte(0x51);
2693   emit_byte(0xC0 | encode);
2694 }
2695 
2696 void Assembler::sqrtsd(XMMRegister dst, Address src) {
2697   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2698   InstructionMark im(this);
2699   emit_byte(0xF2);
2700   prefix(src, dst);
2701   emit_byte(0x0F);
2702   emit_byte(0x51);
2703   emit_operand(dst, src);
2704 }
2705 
2706 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
2707   // HMM Table D-1 says sse2
2708   // NOT_LP64(assert(VM_Version::supports_sse(), ""));
2709   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2710   emit_byte(0xF3);
2711   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2712   emit_byte(0x0F);
2713   emit_byte(0x51);
2714   emit_byte(0xC0 | encode);
2715 }
2716 
2717 void Assembler::sqrtss(XMMRegister dst, Address src) {
2718   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2719   InstructionMark im(this);
2720   emit_byte(0xF3);
2721   prefix(src, dst);
2722   emit_byte(0x0F);
2723   emit_byte(0x51);
2724   emit_operand(dst, src);
2725 }
2726 
2727 void Assembler::stmxcsr( Address dst) {
2728   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2729   InstructionMark im(this);
2730   prefix(dst);
2731   emit_byte(0x0F);
2732   emit_byte(0xAE);
2733   emit_operand(as_Register(3), dst);
2734 }
2735 
2736 void Assembler::subl(Address dst, int32_t imm32) {
2737   InstructionMark im(this);
2738   prefix(dst);
2739   emit_arith_operand(0x81, rbp, dst, imm32);
2740 }
2741 
2742 void Assembler::subl(Address dst, Register src) {
2743   InstructionMark im(this);
2744   prefix(dst, src);
2745   emit_byte(0x29);
2746   emit_operand(src, dst);
2747 }
2748 
2749 void Assembler::subl(Register dst, int32_t imm32) {
2750   prefix(dst);
2751   emit_arith(0x81, 0xE8, dst, imm32);
2752 }
2753 
2754 void Assembler::subl(Register dst, Address src) {
2755   InstructionMark im(this);
2756   prefix(src, dst);
2757   emit_byte(0x2B);
2758   emit_operand(dst, src);
2759 }
2760 
2761 void Assembler::subl(Register dst, Register src) {
2762   (void) prefix_and_encode(dst->encoding(), src->encoding());
2763   emit_arith(0x2B, 0xC0, dst, src);
2764 }
2765 
2766 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
2767   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2768   emit_byte(0xF2);
2769   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2770   emit_byte(0x0F);
2771   emit_byte(0x5C);
2772   emit_byte(0xC0 | encode);
2773 }
2774 
2775 void Assembler::subsd(XMMRegister dst, Address src) {
2776   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2777   InstructionMark im(this);
2778   emit_byte(0xF2);
2779   prefix(src, dst);
2780   emit_byte(0x0F);
2781   emit_byte(0x5C);
2782   emit_operand(dst, src);
2783 }
2784 
2785 void Assembler::subss(XMMRegister dst, XMMRegister src) {
2786   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2787   emit_byte(0xF3);
2788   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2789   emit_byte(0x0F);
2790   emit_byte(0x5C);
2791   emit_byte(0xC0 | encode);
2792 }
2793 
2794 void Assembler::subss(XMMRegister dst, Address src) {
2795   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2796   InstructionMark im(this);
2797   emit_byte(0xF3);
2798   prefix(src, dst);
2799   emit_byte(0x0F);
2800   emit_byte(0x5C);
2801   emit_operand(dst, src);
2802 }
2803 
2804 void Assembler::testb(Register dst, int imm8) {
2805   NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
2806   (void) prefix_and_encode(dst->encoding(), true);
2807   emit_arith_b(0xF6, 0xC0, dst, imm8);
2808 }
2809 
2810 void Assembler::testl(Register dst, int32_t imm32) {
2811   // not using emit_arith because test
2812   // doesn't support sign-extension of
2813   // 8bit operands
2814   int encode = dst->encoding();
2815   if (encode == 0) {
2816     emit_byte(0xA9);
2817   } else {
2818     encode = prefix_and_encode(encode);
2819     emit_byte(0xF7);
2820     emit_byte(0xC0 | encode);
2821   }
2822   emit_long(imm32);
2823 }
2824 
2825 void Assembler::testl(Register dst, Register src) {
2826   (void) prefix_and_encode(dst->encoding(), src->encoding());
2827   emit_arith(0x85, 0xC0, dst, src);
2828 }
2829 
2830 void Assembler::testl(Register dst, Address  src) {
2831   InstructionMark im(this);
2832   prefix(src, dst);
2833   emit_byte(0x85);
2834   emit_operand(dst, src);
2835 }
2836 
2837 void Assembler::ucomisd(XMMRegister dst, Address src) {
2838   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2839   emit_byte(0x66);
2840   ucomiss(dst, src);
2841 }
2842 
2843 void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
2844   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2845   emit_byte(0x66);
2846   ucomiss(dst, src);
2847 }
2848 
2849 void Assembler::ucomiss(XMMRegister dst, Address src) {
2850   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2851 
2852   InstructionMark im(this);
2853   prefix(src, dst);
2854   emit_byte(0x0F);
2855   emit_byte(0x2E);
2856   emit_operand(dst, src);
2857 }
2858 
2859 void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
2860   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2861   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2862   emit_byte(0x0F);
2863   emit_byte(0x2E);
2864   emit_byte(0xC0 | encode);
2865 }
2866 
2867 
2868 void Assembler::xaddl(Address dst, Register src) {
2869   InstructionMark im(this);
2870   prefix(dst, src);
2871   emit_byte(0x0F);
2872   emit_byte(0xC1);
2873   emit_operand(src, dst);
2874 }
2875 
2876 void Assembler::xchgl(Register dst, Address src) { // xchg
2877   InstructionMark im(this);
2878   prefix(src, dst);
2879   emit_byte(0x87);
2880   emit_operand(dst, src);
2881 }
2882 
2883 void Assembler::xchgl(Register dst, Register src) {
2884   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2885   emit_byte(0x87);
2886   emit_byte(0xc0 | encode);
2887 }
2888 
2889 void Assembler::xorl(Register dst, int32_t imm32) {
2890   prefix(dst);
2891   emit_arith(0x81, 0xF0, dst, imm32);
2892 }
2893 
2894 void Assembler::xorl(Register dst, Address src) {
2895   InstructionMark im(this);
2896   prefix(src, dst);
2897   emit_byte(0x33);
2898   emit_operand(dst, src);
2899 }
2900 
2901 void Assembler::xorl(Register dst, Register src) {
2902   (void) prefix_and_encode(dst->encoding(), src->encoding());
2903   emit_arith(0x33, 0xC0, dst, src);
2904 }
2905 
2906 void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
2907   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2908   emit_byte(0x66);
2909   xorps(dst, src);
2910 }
2911 
2912 void Assembler::xorpd(XMMRegister dst, Address src) {
2913   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2914   InstructionMark im(this);
2915   emit_byte(0x66);
2916   prefix(src, dst);
2917   emit_byte(0x0F);
2918   emit_byte(0x57);
2919   emit_operand(dst, src);
2920 }
2921 
2922 
2923 void Assembler::xorps(XMMRegister dst, XMMRegister src) {
2924   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2925   int encode = prefix_and_encode(dst->encoding(), src->encoding());
2926   emit_byte(0x0F);
2927   emit_byte(0x57);
2928   emit_byte(0xC0 | encode);
2929 }
2930 
2931 void Assembler::xorps(XMMRegister dst, Address src) {
2932   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2933   InstructionMark im(this);
2934   prefix(src, dst);
2935   emit_byte(0x0F);
2936   emit_byte(0x57);
2937   emit_operand(dst, src);
2938 }
2939 
2940 #ifndef _LP64
2941 // 32bit only pieces of the assembler
2942 
2943 void Assembler::cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec) {
2944   // NO PREFIX AS NEVER 64BIT
2945   InstructionMark im(this);
2946   emit_byte(0x81);
2947   emit_byte(0xF8 | src1->encoding());
2948   emit_data(imm32, rspec, 0);
2949 }
2950 
2951 void Assembler::cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec) {
2952   // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs
2953   InstructionMark im(this);
2954   emit_byte(0x81);
2955   emit_operand(rdi, src1);
2956   emit_data(imm32, rspec, 0);
2957 }
2958 
// The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax,
// and stores rcx:rbx into adr if they are equal; otherwise, the value at adr is loaded
// into rdx:rax.  The ZF is set if the compared values were equal, and cleared otherwise.
// Encoding: opcode bytes 0F C7 followed by the memory operand.
void Assembler::cmpxchg8(Address adr) {
  InstructionMark im(this);
  emit_byte(0x0F);
  emit_byte(0xc7);
  // rcx is not a data operand here; it supplies the ModRM reg-field digit.
  emit_operand(rcx, adr);
}
2968 
// 32-bit only: one-byte register decrement, encoded as 0x48 | register
// encoding.
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
 emit_byte(0x48 | dst->encoding());
}
2973 
2974 #endif // _LP64
2975 
2976 // 64bit typically doesn't use the x87 but needs to for the trig funcs
2977 
// x87 FABS: emits the fixed two-byte encoding D9 E1.
void Assembler::fabs() {
  emit_byte(0xD9);
  emit_byte(0xE1);
}
2982 
// x87 add with FPU stack register i: forwards bytes D8, C0 and the stack
// index to emit_farith().
void Assembler::fadd(int i) {
  emit_farith(0xD8, 0xC0, i);
}
2986 
// x87 add with a double-precision memory operand: opcode byte DC followed
// by the memory operand.
void Assembler::fadd_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rax, src);
}
2992 
// x87 add with a single-precision memory operand: opcode byte D8 followed
// by the memory operand.
void Assembler::fadd_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rax, src);
}
2998 
// x87 add, DC-opcode register form: forwards bytes DC, C0 and the stack
// index to emit_farith().
void Assembler::fadda(int i) {
  emit_farith(0xDC, 0xC0, i);
}
3002 
// x87 add-and-pop: forwards bytes DE, C0 and the stack index to
// emit_farith().
void Assembler::faddp(int i) {
  emit_farith(0xDE, 0xC0, i);
}
3006 
// x87 FCHS: emits the fixed two-byte encoding D9 E0.
void Assembler::fchs() {
  emit_byte(0xD9);
  emit_byte(0xE0);
}
3011 
// x87 compare with FPU stack register i: forwards bytes D8, D0 and the
// stack index to emit_farith().
void Assembler::fcom(int i) {
  emit_farith(0xD8, 0xD0, i);
}
3015 
// x87 compare-and-pop with FPU stack register i: forwards bytes D8, D8 and
// the stack index to emit_farith().
void Assembler::fcomp(int i) {
  emit_farith(0xD8, 0xD8, i);
}
3019 
// x87 compare-and-pop with a double-precision memory operand: opcode byte
// DC followed by the memory operand.
void Assembler::fcomp_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  // rbx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbx, src);
}
3025 
// x87 compare-and-pop with a single-precision memory operand: opcode byte
// D8 followed by the memory operand.
void Assembler::fcomp_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  // rbx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbx, src);
}
3031 
// x87 FCOMPP: emits the fixed two-byte encoding DE D9.
void Assembler::fcompp() {
  emit_byte(0xDE);
  emit_byte(0xD9);
}
3036 
// x87 FCOS: emits the fixed two-byte encoding D9 FF.
void Assembler::fcos() {
  emit_byte(0xD9);
  emit_byte(0xFF);
}
3041 
// x87 FDECSTP: emits the fixed two-byte encoding D9 F6.
void Assembler::fdecstp() {
  emit_byte(0xD9);
  emit_byte(0xF6);
}
3046 
// x87 divide with FPU stack register i: forwards bytes D8, F0 and the
// stack index to emit_farith().
void Assembler::fdiv(int i) {
  emit_farith(0xD8, 0xF0, i);
}
3050 
// x87 divide by a double-precision memory operand: opcode byte DC followed
// by the memory operand.
void Assembler::fdiv_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  // rsi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rsi, src);
}
3056 
// x87 divide by a single-precision memory operand: opcode byte D8 followed
// by the memory operand.
void Assembler::fdiv_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  // rsi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rsi, src);
}
3062 
// x87 divide, DC-opcode register form: forwards bytes DC, F8 and the stack
// index to emit_farith().
void Assembler::fdiva(int i) {
  emit_farith(0xDC, 0xF8, i);
}
3066 
3067 // Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994)
3068 //       is erroneous for some of the floating-point instructions below.
3069 
// x87 divide-and-pop: forwards bytes DE, F8 and the stack index to
// emit_farith().
void Assembler::fdivp(int i) {
  emit_farith(0xDE, 0xF8, i);                    // ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong)
}
3073 
// x87 reverse divide with FPU stack register i: forwards bytes D8, F8 and
// the stack index to emit_farith().
void Assembler::fdivr(int i) {
  emit_farith(0xD8, 0xF8, i);
}
3077 
// x87 reverse divide with a double-precision memory operand: opcode byte DC
// followed by the memory operand.
void Assembler::fdivr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  // rdi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdi, src);
}
3083 
// x87 reverse divide with a single-precision memory operand: opcode byte D8
// followed by the memory operand.
void Assembler::fdivr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  // rdi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdi, src);
}
3089 
// x87 reverse divide, DC-opcode register form: forwards bytes DC, F0 and
// the stack index to emit_farith().
void Assembler::fdivra(int i) {
  emit_farith(0xDC, 0xF0, i);
}
3093 
// x87 reverse divide-and-pop: forwards bytes DE, F0 and the stack index to
// emit_farith().
void Assembler::fdivrp(int i) {
  emit_farith(0xDE, 0xF0, i);                    // ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong)
}
3097 
// x87 FFREE for FPU stack register i: forwards bytes DD, C0 and the stack
// index to emit_farith().
void Assembler::ffree(int i) {
  emit_farith(0xDD, 0xC0, i);
}
3101 
// x87 integer load, DF-opcode memory form: opcode byte DF followed by the
// memory operand.
void Assembler::fild_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  // rbp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbp, adr);
}
3107 
// x87 integer load, DB-opcode memory form: opcode byte DB followed by the
// memory operand.
void Assembler::fild_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rax, adr);
}
3113 
// x87 FINCSTP: emits the fixed two-byte encoding D9 F7.
void Assembler::fincstp() {
  emit_byte(0xD9);
  emit_byte(0xF7);
}
3118 
// x87 FINIT: emits the three bytes 9B DB E3. The leading 0x9B is the same
// byte that fwait() emits.
void Assembler::finit() {
  emit_byte(0x9B);
  emit_byte(0xDB);
  emit_byte(0xE3);
}
3124 
// x87 integer store (no pop), DB-opcode memory form: opcode byte DB
// followed by the memory operand.
void Assembler::fist_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  // rdx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdx, adr);
}
3130 
// x87 integer store-and-pop, DF-opcode memory form: opcode byte DF followed
// by the memory operand.
void Assembler::fistp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  // rdi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdi, adr);
}
3136 
// x87 integer store-and-pop, DB-opcode memory form: opcode byte DB followed
// by the memory operand.
void Assembler::fistp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  // rbx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbx, adr);
}
3142 
// x87 FLD1: emits the fixed two-byte encoding D9 E8.
void Assembler::fld1() {
  emit_byte(0xD9);
  emit_byte(0xE8);
}
3147 
// x87 load of a double-precision memory operand: opcode byte DD followed by
// the memory operand.
void Assembler::fld_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rax, adr);
}
3153 
// x87 load of a single-precision memory operand: opcode byte D9 followed by
// the memory operand.
void Assembler::fld_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rax, adr);
}
3159 
3160 
// x87 load from FPU stack register `index`: forwards bytes D9, C0 and the
// stack index to emit_farith().
void Assembler::fld_s(int index) {
  emit_farith(0xD9, 0xC0, index);
}
3164 
// x87 load, DB-opcode memory form (the _x suffix presumably denotes the
// extended-precision operand): opcode byte DB followed by the memory operand.
void Assembler::fld_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  // rbp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbp, adr);
}
3170 
// x87 FLDCW: opcode byte D9 followed by the memory operand holding the
// control word to load.
void Assembler::fldcw(Address src) {
  InstructionMark im(this);
  emit_byte(0xd9);
  // rbp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbp, src);
}
3176 
// x87 FLDENV: opcode byte D9 followed by the memory operand.
void Assembler::fldenv(Address src) {
  InstructionMark im(this);
  emit_byte(0xD9);
  // rsp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rsp, src);
}
3182 
// x87 FLDLG2: emits the fixed two-byte encoding D9 EC.
void Assembler::fldlg2() {
  emit_byte(0xD9);
  emit_byte(0xEC);
}
3187 
// x87 FLDLN2: emits the fixed two-byte encoding D9 ED.
void Assembler::fldln2() {
  emit_byte(0xD9);
  emit_byte(0xED);
}
3192 
// x87 FLDZ: emits the fixed two-byte encoding D9 EE.
void Assembler::fldz() {
  emit_byte(0xD9);
  emit_byte(0xEE);
}
3197 
// Natural-log helper built from three x87 instructions, emitted in this
// order: fldln2, fxch (default argument), fyl2x.
void Assembler::flog() {
  fldln2();
  fxch();
  fyl2x();
}
3203 
// Base-10 log helper built from three x87 instructions, emitted in this
// order: fldlg2, fxch (default argument), fyl2x.
void Assembler::flog10() {
  fldlg2();
  fxch();
  fyl2x();
}
3209 
// x87 multiply with FPU stack register i: forwards bytes D8, C8 and the
// stack index to emit_farith().
void Assembler::fmul(int i) {
  emit_farith(0xD8, 0xC8, i);
}
3213 
// x87 multiply by a double-precision memory operand: opcode byte DC
// followed by the memory operand.
void Assembler::fmul_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  // rcx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rcx, src);
}
3219 
// x87 multiply by a single-precision memory operand: opcode byte D8
// followed by the memory operand.
void Assembler::fmul_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  // rcx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rcx, src);
}
3225 
// x87 multiply, DC-opcode register form: forwards bytes DC, C8 and the
// stack index to emit_farith().
void Assembler::fmula(int i) {
  emit_farith(0xDC, 0xC8, i);
}
3229 
// x87 multiply-and-pop: forwards bytes DE, C8 and the stack index to
// emit_farith().
void Assembler::fmulp(int i) {
  emit_farith(0xDE, 0xC8, i);
}
3233 
// x87 FNSAVE: opcode byte DD followed by the destination memory operand.
void Assembler::fnsave(Address dst) {
  InstructionMark im(this);
  emit_byte(0xDD);
  // rsi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rsi, dst);
}
3239 
// Stores the x87 control word to memory: bytes 9B D9 followed by the memory
// operand.
// NOTE(review): despite the "fn" (no-wait) name, a 0x9B byte is emitted
// first; that is the same byte fwait() emits. Also, `src` names the memory
// location that is written. Confirm both are intentional before changing.
void Assembler::fnstcw(Address src) {
  InstructionMark im(this);
  emit_byte(0x9B);
  emit_byte(0xD9);
  // rdi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdi, src);
}
3246 
// x87 FNSTSW AX: emits the fixed two-byte encoding DF E0.
void Assembler::fnstsw_ax() {
  emit_byte(0xdF);
  emit_byte(0xE0);
}
3251 
// x87 FPREM: emits the fixed two-byte encoding D9 F8.
void Assembler::fprem() {
  emit_byte(0xD9);
  emit_byte(0xF8);
}
3256 
// x87 FPREM1: emits the fixed two-byte encoding D9 F5.
void Assembler::fprem1() {
  emit_byte(0xD9);
  emit_byte(0xF5);
}
3261 
// x87 FRSTOR: opcode byte DD followed by the source memory operand.
void Assembler::frstor(Address src) {
  InstructionMark im(this);
  emit_byte(0xDD);
  // rsp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rsp, src);
}
3267 
// x87 FSIN: emits the fixed two-byte encoding D9 FE.
void Assembler::fsin() {
  emit_byte(0xD9);
  emit_byte(0xFE);
}
3272 
// x87 FSQRT: emits the fixed two-byte encoding D9 FA.
void Assembler::fsqrt() {
  emit_byte(0xD9);
  emit_byte(0xFA);
}
3277 
// x87 store (no pop) to a double-precision memory operand: opcode byte DD
// followed by the memory operand.
void Assembler::fst_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  // rdx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdx, adr);
}
3283 
// x87 store (no pop) to a single-precision memory operand: opcode byte D9
// followed by the memory operand.
void Assembler::fst_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  // rdx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdx, adr);
}
3289 
// x87 store-and-pop to a double-precision memory operand: opcode byte DD
// followed by the memory operand.
void Assembler::fstp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  // rbx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbx, adr);
}
3295 
// x87 store-and-pop to FPU stack register `index`: forwards bytes DD, D8
// and the stack index to emit_farith().
void Assembler::fstp_d(int index) {
  emit_farith(0xDD, 0xD8, index);
}
3299 
// x87 store-and-pop to a single-precision memory operand: opcode byte D9
// followed by the memory operand.
void Assembler::fstp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  // rbx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbx, adr);
}
3305 
// x87 store-and-pop, DB-opcode memory form (the _x suffix presumably denotes
// the extended-precision operand): opcode byte DB followed by the memory
// operand.
void Assembler::fstp_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  // rdi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rdi, adr);
}
3311 
// x87 subtract with FPU stack register i: forwards bytes D8, E0 and the
// stack index to emit_farith().
void Assembler::fsub(int i) {
  emit_farith(0xD8, 0xE0, i);
}
3315 
// x87 subtract of a double-precision memory operand: opcode byte DC
// followed by the memory operand.
void Assembler::fsub_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  // rsp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rsp, src);
}
3321 
// x87 subtract of a single-precision memory operand: opcode byte D8
// followed by the memory operand.
void Assembler::fsub_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  // rsp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rsp, src);
}
3327 
// x87 subtract, DC-opcode register form: forwards bytes DC, E8 and the
// stack index to emit_farith().
void Assembler::fsuba(int i) {
  emit_farith(0xDC, 0xE8, i);
}
3331 
// x87 subtract-and-pop: forwards bytes DE, E8 and the stack index to
// emit_farith().
void Assembler::fsubp(int i) {
  emit_farith(0xDE, 0xE8, i);                    // ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong)
}
3335 
// x87 reverse subtract with FPU stack register i: forwards bytes D8, E8 and
// the stack index to emit_farith().
void Assembler::fsubr(int i) {
  emit_farith(0xD8, 0xE8, i);
}
3339 
// x87 reverse subtract with a double-precision memory operand: opcode byte
// DC followed by the memory operand.
void Assembler::fsubr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  // rbp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbp, src);
}
3345 
// x87 reverse subtract with a single-precision memory operand: opcode byte
// D8 followed by the memory operand.
void Assembler::fsubr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  // rbp is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand32(rbp, src);
}
3351 
// x87 reverse subtract, DC-opcode register form: forwards bytes DC, E0 and
// the stack index to emit_farith().
void Assembler::fsubra(int i) {
  emit_farith(0xDC, 0xE0, i);
}
3355 
// x87 reverse subtract-and-pop: forwards bytes DE, E0 and the stack index
// to emit_farith().
void Assembler::fsubrp(int i) {
  emit_farith(0xDE, 0xE0, i);                    // ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong)
}
3359 
// Tangent helper: emits two x87 instructions back to back, D9 F2 followed
// by DD D8. The second pair is the same encoding fstp_d(0) would produce
// (bytes DD, D8 with stack index 0).
void Assembler::ftan() {
  emit_byte(0xD9);
  emit_byte(0xF2);
  emit_byte(0xDD);
  emit_byte(0xD8);
}
3366 
// x87 FTST: emits the fixed two-byte encoding D9 E4.
void Assembler::ftst() {
  emit_byte(0xD9);
  emit_byte(0xE4);
}
3371 
// x87 FUCOMI with FPU stack register i: forwards bytes DB, E8 and the stack
// index to emit_farith(). Fails the guarantee (in all build types) when the
// CPU does not report cmov support.
void Assembler::fucomi(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDB, 0xE8, i);
}
3377 
// x87 FUCOMIP with FPU stack register i: forwards bytes DF, E8 and the
// stack index to emit_farith(). Fails the guarantee (in all build types)
// when the CPU does not report cmov support.
void Assembler::fucomip(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDF, 0xE8, i);
}
3383 
// x87 FWAIT: emits the single byte 9B.
void Assembler::fwait() {
  emit_byte(0x9B);
}
3387 
// x87 exchange with FPU stack register i: forwards bytes D9, C8 and the
// stack index to emit_farith().
void Assembler::fxch(int i) {
  emit_farith(0xD9, 0xC8, i);
}
3391 
// x87 FYL2X: emits the fixed two-byte encoding D9 F1.
void Assembler::fyl2x() {
  emit_byte(0xD9);
  emit_byte(0xF1);
}
3396 
3397 
3398 #ifndef _LP64
3399 
// 32-bit only: one-byte register increment, encoded as 0x40 | register
// encoding.
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
 emit_byte(0x40 | dst->encoding());
}
3404 
// 32-bit only: pointer-sized lea is simply the 32-bit leal().
void Assembler::lea(Register dst, Address src) {
  leal(dst, src);
}
3408 
// 32-bit only: stores a relocatable 32-bit literal to memory. Emits opcode
// C7, the memory operand, then the literal with its relocation (rspec).
void Assembler::mov_literal32(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xC7);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand(rax, dst);
  emit_data((int)imm32, rspec, 0);
}
3415 
3416 void Assembler::mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec) {
3417   InstructionMark im(this);
3418   int encode = prefix_and_encode(dst->encoding());
3419   emit_byte(0xB8 | encode);
3420   emit_data((int)imm32, rspec, 0);
3421 }
3422 
// 32-bit only: POPA, emitted as the single byte 0x61.
void Assembler::popa() { // 32bit
  emit_byte(0x61);
}
3426 
// 32-bit only: pushes a relocatable 32-bit literal. Emits opcode 0x68
// followed by the literal with its relocation (rspec).
void Assembler::push_literal32(int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0x68);
  emit_data(imm32, rspec, 0);
}
3432 
// 32-bit only: PUSHA, emitted as the single byte 0x60.
void Assembler::pusha() { // 32bit
  emit_byte(0x60);
}
3436 
// 32-bit variant: sets the byte register dst based on the "not zero"
// condition. Emits opcode bytes 0F 95 and a ModRM byte of
// 0xE0 | register encoding; no prefix is emitted.
void Assembler::set_byte_if_not_zero(Register dst) {
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | dst->encoding());
}
3442 
// 32-bit only: double-precision shift left, opcode bytes 0F A5, with a
// register-direct ModRM byte that places src in the reg field (bits 5..3)
// and dst in the r/m field (bits 2..0).
void Assembler::shldl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xA5);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}
3448 
// 32-bit only: double-precision shift right, opcode bytes 0F AD, with a
// register-direct ModRM byte that places src in the reg field (bits 5..3)
// and dst in the r/m field (bits 2..0).
void Assembler::shrdl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xAD);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}
3454 
3455 #else // LP64
3456 
3457 void Assembler::set_byte_if_not_zero(Register dst) {
3458   int enc = prefix_and_encode(dst->encoding(), true);
3459   emit_byte(0x0F);
3460   emit_byte(0x95);
3461   emit_byte(0xE0 | enc);
3462 }
3463 
3464 // 64bit only pieces of the assembler
3465 // This should only be used by 64bit instructions that can use rip-relative
3466 // it cannot be used by instructions that want an immediate value.
3467 
3468 bool Assembler::reachable(AddressLiteral adr) {
3469   int64_t disp;
3470   // None will force a 64bit literal to the code stream. Likely a placeholder
3471   // for something that will be patched later and we need to certain it will
3472   // always be reachable.
3473   if (adr.reloc() == relocInfo::none) {
3474     return false;
3475   }
3476   if (adr.reloc() == relocInfo::internal_word_type) {
3477     // This should be rip relative and easily reachable.
3478     return true;
3479   }
3480   if (adr.reloc() == relocInfo::virtual_call_type ||
3481       adr.reloc() == relocInfo::opt_virtual_call_type ||
3482       adr.reloc() == relocInfo::static_call_type ||
3483       adr.reloc() == relocInfo::static_stub_type ) {
3484     // This should be rip relative within the code cache and easily
3485     // reachable until we get huge code caches. (At which point
3486     // ic code is going to have issues).
3487     return true;
3488   }
3489   if (adr.reloc() != relocInfo::external_word_type &&
3490       adr.reloc() != relocInfo::poll_return_type &&  // these are really external_word but need special
3491       adr.reloc() != relocInfo::poll_type &&         // relocs to identify them
3492       adr.reloc() != relocInfo::runtime_call_type ) {
3493     return false;
3494   }
3495 
3496   // Stress the correction code
3497   if (ForceUnreachable) {
3498     // Must be runtimecall reloc, see if it is in the codecache
3499     // Flipping stuff in the codecache to be unreachable causes issues
3500     // with things like inline caches where the additional instructions
3501     // are not handled.
3502     if (CodeCache::find_blob(adr._target) == NULL) {
3503       return false;
3504     }
3505   }
3506   // For external_word_type/runtime_call_type if it is reachable from where we
3507   // are now (possibly a temp buffer) and where we might end up
3508   // anywhere in the codeCache then we are always reachable.
3509   // This would have to change if we ever save/restore shared code
3510   // to be more pessimistic.
3511   disp = (int64_t)adr._target - ((int64_t)CodeCache::low_bound() + sizeof(int));
3512   if (!is_simm32(disp)) return false;
3513   disp = (int64_t)adr._target - ((int64_t)CodeCache::high_bound() + sizeof(int));
3514   if (!is_simm32(disp)) return false;
3515 
3516   disp = (int64_t)adr._target - ((int64_t)_code_pos + sizeof(int));
3517 
3518   // Because rip relative is a disp + address_of_next_instruction and we
3519   // don't know the value of address_of_next_instruction we apply a fudge factor
3520   // to make sure we will be ok no matter the size of the instruction we get placed into.
3521   // We don't have to fudge the checks above here because they are already worst case.
3522 
3523   // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal
3524   // + 4 because better safe than sorry.
3525   const int fudge = 12 + 4;
3526   if (disp < 0) {
3527     disp -= fudge;
3528   } else {
3529     disp += fudge;
3530   }
3531   return is_simm32(disp);
3532 }
3533 
3534 // Check if the polling page is not reachable from the code cache using rip-relative
3535 // addressing.
3536 bool Assembler::is_polling_page_far() {
3537   intptr_t addr = (intptr_t)os::get_polling_page();
3538   return !is_simm32(addr - (intptr_t)CodeCache::low_bound()) ||
3539          !is_simm32(addr - (intptr_t)CodeCache::high_bound());
3540 }
3541 
3542 void Assembler::emit_data64(jlong data,
3543                             relocInfo::relocType rtype,
3544                             int format) {
3545   if (rtype == relocInfo::none) {
3546     emit_long64(data);
3547   } else {
3548     emit_data64(data, Relocation::spec_simple(rtype), format);
3549   }
3550 }
3551 
// Emits a 64-bit data word together with a relocation record.
//   data   - the 64-bit value written to the code stream
//   rspec  - relocation attached to the enclosing instruction
//   format - must equal imm_operand (asserted)
// Must be called while an InstructionMark is active (asserted), because the
// relocation is recorded at inst_mark() rather than at the data word.
void Assembler::emit_data64(jlong data,
                            RelocationHolder const& rspec,
                            int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(imm_operand == format, "must be immediate");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  // Do not use AbstractAssembler::relocate, which is not intended for
  // embedded words.  Instead, relocate to the enclosing instruction.
  code_section()->relocate(inst_mark(), rspec, format);
#ifdef ASSERT
  check_relocation(rspec, format);
#endif
  // The data word is emitted only after the relocation has been recorded.
  emit_long64(data);
}
3566 
3567 int Assembler::prefix_and_encode(int reg_enc, bool byteinst) {
3568   if (reg_enc >= 8) {
3569     prefix(REX_B);
3570     reg_enc -= 8;
3571   } else if (byteinst && reg_enc >= 4) {
3572     prefix(REX);
3573   }
3574   return reg_enc;
3575 }
3576 
3577 int Assembler::prefixq_and_encode(int reg_enc) {
3578   if (reg_enc < 8) {
3579     prefix(REX_W);
3580   } else {
3581     prefix(REX_WB);
3582     reg_enc -= 8;
3583   }
3584   return reg_enc;
3585 }
3586 
3587 int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst) {
3588   if (dst_enc < 8) {
3589     if (src_enc >= 8) {
3590       prefix(REX_B);
3591       src_enc -= 8;
3592     } else if (byteinst && src_enc >= 4) {
3593       prefix(REX);
3594     }
3595   } else {
3596     if (src_enc < 8) {
3597       prefix(REX_R);
3598     } else {
3599       prefix(REX_RB);
3600       src_enc -= 8;
3601     }
3602     dst_enc -= 8;
3603   }
3604   return dst_enc << 3 | src_enc;
3605 }
3606 
3607 int Assembler::prefixq_and_encode(int dst_enc, int src_enc) {
3608   if (dst_enc < 8) {
3609     if (src_enc < 8) {
3610       prefix(REX_W);
3611     } else {
3612       prefix(REX_WB);
3613       src_enc -= 8;
3614     }
3615   } else {
3616     if (src_enc < 8) {
3617       prefix(REX_WR);
3618     } else {
3619       prefix(REX_WRB);
3620       src_enc -= 8;
3621     }
3622     dst_enc -= 8;
3623   }
3624   return dst_enc << 3 | src_enc;
3625 }
3626 
// Emits REX_B when the register's encoding is 8 or above; emits nothing
// otherwise.
void Assembler::prefix(Register reg) {
  if (reg->encoding() >= 8) {
    prefix(REX_B);
  }
}
3632 
3633 void Assembler::prefix(Address adr) {
3634   if (adr.base_needs_rex()) {
3635     if (adr.index_needs_rex()) {
3636       prefix(REX_XB);
3637     } else {
3638       prefix(REX_B);
3639     }
3640   } else {
3641     if (adr.index_needs_rex()) {
3642       prefix(REX_X);
3643     }
3644   }
3645 }
3646 
3647 void Assembler::prefixq(Address adr) {
3648   if (adr.base_needs_rex()) {
3649     if (adr.index_needs_rex()) {
3650       prefix(REX_WXB);
3651     } else {
3652       prefix(REX_WB);
3653     }
3654   } else {
3655     if (adr.index_needs_rex()) {
3656       prefix(REX_WX);
3657     } else {
3658       prefix(REX_W);
3659     }
3660   }
3661 }
3662 
3663 
3664 void Assembler::prefix(Address adr, Register reg, bool byteinst) {
3665   if (reg->encoding() < 8) {
3666     if (adr.base_needs_rex()) {
3667       if (adr.index_needs_rex()) {
3668         prefix(REX_XB);
3669       } else {
3670         prefix(REX_B);
3671       }
3672     } else {
3673       if (adr.index_needs_rex()) {
3674         prefix(REX_X);
3675       } else if (byteinst && reg->encoding() >= 4 ) {
3676         prefix(REX);
3677       }
3678     }
3679   } else {
3680     if (adr.base_needs_rex()) {
3681       if (adr.index_needs_rex()) {
3682         prefix(REX_RXB);
3683       } else {
3684         prefix(REX_RB);
3685       }
3686     } else {
3687       if (adr.index_needs_rex()) {
3688         prefix(REX_RX);
3689       } else {
3690         prefix(REX_R);
3691       }
3692     }
3693   }
3694 }
3695 
3696 void Assembler::prefixq(Address adr, Register src) {
3697   if (src->encoding() < 8) {
3698     if (adr.base_needs_rex()) {
3699       if (adr.index_needs_rex()) {
3700         prefix(REX_WXB);
3701       } else {
3702         prefix(REX_WB);
3703       }
3704     } else {
3705       if (adr.index_needs_rex()) {
3706         prefix(REX_WX);
3707       } else {
3708         prefix(REX_W);
3709       }
3710     }
3711   } else {
3712     if (adr.base_needs_rex()) {
3713       if (adr.index_needs_rex()) {
3714         prefix(REX_WRXB);
3715       } else {
3716         prefix(REX_WRB);
3717       }
3718     } else {
3719       if (adr.index_needs_rex()) {
3720         prefix(REX_WRX);
3721       } else {
3722         prefix(REX_WR);
3723       }
3724     }
3725   }
3726 }
3727 
3728 void Assembler::prefix(Address adr, XMMRegister reg) {
3729   if (reg->encoding() < 8) {
3730     if (adr.base_needs_rex()) {
3731       if (adr.index_needs_rex()) {
3732         prefix(REX_XB);
3733       } else {
3734         prefix(REX_B);
3735       }
3736     } else {
3737       if (adr.index_needs_rex()) {
3738         prefix(REX_X);
3739       }
3740     }
3741   } else {
3742     if (adr.base_needs_rex()) {
3743       if (adr.index_needs_rex()) {
3744         prefix(REX_RXB);
3745       } else {
3746         prefix(REX_RB);
3747       }
3748     } else {
3749       if (adr.index_needs_rex()) {
3750         prefix(REX_RX);
3751       } else {
3752         prefix(REX_R);
3753       }
3754     }
3755   }
3756 }
3757 
// 64-bit add-with-carry of an immediate into dst: REX.W-based prefix, then
// emit_arith() with bytes 0x81, 0xD0. The helper's return value is
// intentionally dropped; it is called only to emit the prefix.
void Assembler::adcq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD0, dst, imm32);
}
3762 
// 64-bit add-with-carry of a memory operand into dst: REX.W-based prefix,
// opcode 0x13, then the operand pair.
void Assembler::adcq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}
3769 
3770 void Assembler::adcq(Register dst, Register src) {
3771   (int) prefixq_and_encode(dst->encoding(), src->encoding());
3772   emit_arith(0x13, 0xC0, dst, src);
3773 }
3774 
// 64-bit add of an immediate to a memory operand: REX.W-based prefix, then
// emit_arith_operand() with opcode 0x81.
void Assembler::addq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_arith_operand(0x81, rax, dst,imm32);
}
3780 
// 64-bit add of register src into a memory operand: REX.W-based prefix,
// opcode 0x01, then the operand pair.
void Assembler::addq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}
3787 
// 64-bit add of an immediate into dst: REX.W-based prefix, then emit_arith()
// with bytes 0x81, 0xC0. The helper's return value is intentionally dropped;
// it is called only to emit the prefix.
void Assembler::addq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC0, dst, imm32);
}
3792 
// 64-bit add of a memory operand into dst: REX.W-based prefix, opcode 0x03,
// then the operand pair.
void Assembler::addq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}
3799 
// 64-bit add of src into dst: REX.W-based prefix, then emit_arith() with
// bytes 0x03, 0xC0. The helper's return value is intentionally dropped; it
// is called only to emit the prefix.
void Assembler::addq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
3804 
// 64-bit AND of a 32-bit immediate into a memory operand: REX.W-based
// prefix, opcode 0x81, the memory operand, then the immediate.
void Assembler::andq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  // rsp is a fixed placeholder supplying the ModRM reg-field digit; the
  // trailing 4 matches the size of the immediate emitted right after the
  // operand.
  emit_operand(rsp, dst, 4);
  emit_long(imm32);
}
3812 
// 64-bit AND of an immediate into dst: REX.W-based prefix, then emit_arith()
// with bytes 0x81, 0xE0. The helper's return value is intentionally dropped;
// it is called only to emit the prefix.
void Assembler::andq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE0, dst, imm32);
}
3817 
// 64-bit AND of a memory operand into dst: REX.W-based prefix, opcode 0x23,
// then the operand pair.
void Assembler::andq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}
3824 
3825 void Assembler::andq(Register dst, Register src) {
3826   (int) prefixq_and_encode(dst->encoding(), src->encoding());
3827   emit_arith(0x23, 0xC0, dst, src);
3828 }
3829 
3830 void Assembler::bsfq(Register dst, Register src) {
3831   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
3832   emit_byte(0x0F);
3833   emit_byte(0xBC);
3834   emit_byte(0xC0 | encode);
3835 }
3836 
3837 void Assembler::bsrq(Register dst, Register src) {
3838   assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
3839   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
3840   emit_byte(0x0F);
3841   emit_byte(0xBD);
3842   emit_byte(0xC0 | encode);
3843 }
3844 
3845 void Assembler::bswapq(Register reg) {
3846   int encode = prefixq_and_encode(reg->encoding());
3847   emit_byte(0x0F);
3848   emit_byte(0xC8 | encode);
3849 }
3850 
// 64-bit form of the 0x99 sign-extension instruction: REX_W prefix followed
// by opcode 0x99.
void Assembler::cdqq() {
  prefix(REX_W);
  emit_byte(0x99);
}
3855 
// CLFLUSH of the given address: optional prefix for the memory operand,
// opcode bytes 0F AE, then the operand.
// NOTE(review): unlike most memory-operand emitters in this file, no
// InstructionMark is created here - confirm that is intentional.
void Assembler::clflush(Address adr) {
  prefix(adr);
  emit_byte(0x0F);
  emit_byte(0xAE);
  // rdi is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand(rdi, adr);
}
3862 
3863 void Assembler::cmovq(Condition cc, Register dst, Register src) {
3864   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
3865   emit_byte(0x0F);
3866   emit_byte(0x40 | cc);
3867   emit_byte(0xC0 | encode);
3868 }
3869 
// 64-bit conditional move from memory: REX.W-based prefix, 0x0F, the opcode
// byte 0x40 with the condition code folded in, then the operand pair.
void Assembler::cmovq(Condition cc, Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
3877 
// 64-bit compare of a memory operand with a 32-bit immediate: REX.W-based
// prefix, opcode 0x81, the memory operand, then the immediate.
void Assembler::cmpq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  // rdi is a fixed placeholder supplying the ModRM reg-field digit; the
  // trailing 4 matches the size of the immediate emitted right after the
  // operand.
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}
3885 
// 64-bit compare of dst with an immediate: REX.W-based prefix, then
// emit_arith() with bytes 0x81, 0xF8. The helper's return value is
// intentionally dropped; it is called only to emit the prefix.
void Assembler::cmpq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xF8, dst, imm32);
}
3890 
3891 void Assembler::cmpq(Address dst, Register src) {
3892   InstructionMark im(this);
3893   prefixq(dst, src);
3894   emit_byte(0x3B);
3895   emit_operand(src, dst);
3896 }
3897 
// 64-bit compare of dst with src: REX.W-based prefix, then emit_arith() with
// bytes 0x3B, 0xC0. The helper's return value is intentionally dropped; it
// is called only to emit the prefix.
void Assembler::cmpq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}
3902 
// 64-bit compare of dst with a memory operand: REX.W-based prefix, opcode
// 0x3B, then the operand pair.
void Assembler::cmpq(Register dst, Address  src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}
3909 
// 64-bit compare-and-exchange of reg with the memory operand adr:
// REX.W-based prefix, opcode bytes 0F B1, then the operand pair.
// No lock prefix is emitted by this function.
void Assembler::cmpxchgq(Register reg, Address adr) {
  InstructionMark im(this);
  prefixq(adr, reg);
  emit_byte(0x0F);
  emit_byte(0xB1);
  emit_operand(reg, adr);
}
3917 
3918 void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
3919   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3920   emit_byte(0xF2);
3921   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
3922   emit_byte(0x0F);
3923   emit_byte(0x2A);
3924   emit_byte(0xC0 | encode);
3925 }
3926 
3927 void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
3928   NOT_LP64(assert(VM_Version::supports_sse(), ""));
3929   emit_byte(0xF3);
3930   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
3931   emit_byte(0x0F);
3932   emit_byte(0x2A);
3933   emit_byte(0xC0 | encode);
3934 }
3935 
3936 void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
3937   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3938   emit_byte(0xF2);
3939   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
3940   emit_byte(0x0F);
3941   emit_byte(0x2C);
3942   emit_byte(0xC0 | encode);
3943 }
3944 
3945 void Assembler::cvttss2siq(Register dst, XMMRegister src) {
3946   NOT_LP64(assert(VM_Version::supports_sse(), ""));
3947   emit_byte(0xF3);
3948   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
3949   emit_byte(0x0F);
3950   emit_byte(0x2C);
3951   emit_byte(0xC0 | encode);
3952 }
3953 
3954 void Assembler::decl(Register dst) {
3955   // Don't use it directly. Use MacroAssembler::decrementl() instead.
3956   // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
3957   int encode = prefix_and_encode(dst->encoding());
3958   emit_byte(0xFF);
3959   emit_byte(0xC8 | encode);
3960 }
3961 
3962 void Assembler::decq(Register dst) {
3963   // Don't use it directly. Use MacroAssembler::decrementq() instead.
3964   // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
3965   int encode = prefixq_and_encode(dst->encoding());
3966   emit_byte(0xFF);
3967   emit_byte(0xC8 | encode);
3968 }
3969 
// 64-bit decrement of a memory operand: REX.W-based prefix, opcode 0xFF,
// then the memory operand.
void Assembler::decq(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  // rcx is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand(rcx, dst);
}
3977 
// FXRSTOR from the given memory operand, with a REX.W-based prefix: opcode
// bytes 0F AE, then the operand.
// NOTE(review): no InstructionMark is created here, unlike most
// memory-operand emitters in this file - confirm that is intentional.
void Assembler::fxrstor(Address src) {
  prefixq(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  // Register number 1 is a placeholder supplying the ModRM reg-field digit.
  emit_operand(as_Register(1), src);
}
3984 
// FXSAVE to the given memory operand, with a REX.W-based prefix: opcode
// bytes 0F AE, then the operand.
// NOTE(review): no InstructionMark is created here, unlike most
// memory-operand emitters in this file - confirm that is intentional.
void Assembler::fxsave(Address dst) {
  prefixq(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  // Register number 0 is a placeholder supplying the ModRM reg-field digit.
  emit_operand(as_Register(0), dst);
}
3991 
3992 void Assembler::idivq(Register src) {
3993   int encode = prefixq_and_encode(src->encoding());
3994   emit_byte(0xF7);
3995   emit_byte(0xF8 | encode);
3996 }
3997 
3998 void Assembler::imulq(Register dst, Register src) {
3999   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4000   emit_byte(0x0F);
4001   emit_byte(0xAF);
4002   emit_byte(0xC0 | encode);
4003 }
4004 
4005 void Assembler::imulq(Register dst, Register src, int value) {
4006   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4007   if (is8bit(value)) {
4008     emit_byte(0x6B);
4009     emit_byte(0xC0 | encode);
4010     emit_byte(value & 0xFF);
4011   } else {
4012     emit_byte(0x69);
4013     emit_byte(0xC0 | encode);
4014     emit_long(value);
4015   }
4016 }
4017 
4018 void Assembler::incl(Register dst) {
4019   // Don't use it directly. Use MacroAssembler::incrementl() instead.
4020   // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
4021   int encode = prefix_and_encode(dst->encoding());
4022   emit_byte(0xFF);
4023   emit_byte(0xC0 | encode);
4024 }
4025 
4026 void Assembler::incq(Register dst) {
4027   // Don't use it directly. Use MacroAssembler::incrementq() instead.
4028   // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
4029   int encode = prefixq_and_encode(dst->encoding());
4030   emit_byte(0xFF);
4031   emit_byte(0xC0 | encode);
4032 }
4033 
// 64-bit increment of a memory operand: REX.W-based prefix, opcode 0xFF,
// then the memory operand.
void Assembler::incq(Address dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  // rax is a fixed placeholder supplying the ModRM reg-field digit.
  emit_operand(rax, dst);
}
4041 
// 64-bit build: pointer-sized lea is simply the 64-bit leaq().
void Assembler::lea(Register dst, Address src) {
  leaq(dst, src);
}
4045 
// 64-bit load-effective-address of src into dst: REX.W-based prefix, opcode
// 0x8D, then the operand pair.
void Assembler::leaq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8D);
  emit_operand(dst, src);
}
4052 
4053 void Assembler::mov64(Register dst, int64_t imm64) {
4054   InstructionMark im(this);
4055   int encode = prefixq_and_encode(dst->encoding());
4056   emit_byte(0xB8 | encode);
4057   emit_long64(imm64);
4058 }
4059 
4060 void Assembler::mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec) {
4061   InstructionMark im(this);
4062   int encode = prefixq_and_encode(dst->encoding());
4063   emit_byte(0xB8 | encode);
4064   emit_data64(imm64, rspec);
4065 }
4066 
4067 void Assembler::mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec) {
4068   InstructionMark im(this);
4069   int encode = prefix_and_encode(dst->encoding());
4070   emit_byte(0xB8 | encode);
4071   emit_data((int)imm32, rspec, narrow_oop_operand);
4072 }
4073 
// Stores a relocatable 32-bit narrow-oop literal to memory: optional prefix
// for the memory operand, opcode 0xC7, the operand, then the 4-byte value
// recorded with the narrow_oop_operand format.
void Assembler::mov_narrow_oop(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  // rax is a fixed placeholder supplying the ModRM reg-field digit; the
  // trailing 4 matches the size of the immediate emitted right after the
  // operand.
  emit_operand(rax, dst, 4);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4081 
4082 void Assembler::cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec) {
4083   InstructionMark im(this);
4084   int encode = prefix_and_encode(src1->encoding());
4085   emit_byte(0x81);
4086   emit_byte(0xF8 | encode);
4087   emit_data((int)imm32, rspec, narrow_oop_operand);
4088 }
4089 
4090 void Assembler::cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec) {
4091   InstructionMark im(this);
4092   prefix(src1);
4093   emit_byte(0x81);
4094   emit_operand(rax, src1, 4);
4095   emit_data((int)imm32, rspec, narrow_oop_operand);
4096 }
4097 
4098 void Assembler::lzcntq(Register dst, Register src) {
4099   assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
4100   emit_byte(0xF3);
4101   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4102   emit_byte(0x0F);
4103   emit_byte(0xBD);
4104   emit_byte(0xC0 | encode);
4105 }
4106 
4107 void Assembler::movdq(XMMRegister dst, Register src) {
4108   // table D-1 says MMX/SSE2
4109   NOT_LP64(assert(VM_Version::supports_sse2() || VM_Version::supports_mmx(), ""));
4110   emit_byte(0x66);
4111   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4112   emit_byte(0x0F);
4113   emit_byte(0x6E);
4114   emit_byte(0xC0 | encode);
4115 }
4116 
4117 void Assembler::movdq(Register dst, XMMRegister src) {
4118   // table D-1 says MMX/SSE2
4119   NOT_LP64(assert(VM_Version::supports_sse2() || VM_Version::supports_mmx(), ""));
4120   emit_byte(0x66);
4121   // swap src/dst to get correct prefix
4122   int encode = prefixq_and_encode(src->encoding(), dst->encoding());
4123   emit_byte(0x0F);
4124   emit_byte(0x7E);
4125   emit_byte(0xC0 | encode);
4126 }
4127 
4128 void Assembler::movq(Register dst, Register src) {
4129   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4130   emit_byte(0x8B);
4131   emit_byte(0xC0 | encode);
4132 }
4133 
4134 void Assembler::movq(Register dst, Address src) {
4135   InstructionMark im(this);
4136   prefixq(src, dst);
4137   emit_byte(0x8B);
4138   emit_operand(dst, src);
4139 }
4140 
4141 void Assembler::movq(Address dst, Register src) {
4142   InstructionMark im(this);
4143   prefixq(dst, src);
4144   emit_byte(0x89);
4145   emit_operand(src, dst);
4146 }
4147 
4148 void Assembler::movsbq(Register dst, Address src) {
4149   InstructionMark im(this);
4150   prefixq(src, dst);
4151   emit_byte(0x0F);
4152   emit_byte(0xBE);
4153   emit_operand(dst, src);
4154 }
4155 
4156 void Assembler::movsbq(Register dst, Register src) {
4157   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4158   emit_byte(0x0F);
4159   emit_byte(0xBE);
4160   emit_byte(0xC0 | encode);
4161 }
4162 
4163 void Assembler::movslq(Register dst, int32_t imm32) {
4164   // dbx shows movslq(rcx, 3) as movq     $0x0000000049000000,(%rbx)
4165   // and movslq(r8, 3); as movl     $0x0000000048000000,(%rbx)
4166   // as a result we shouldn't use until tested at runtime...
4167   ShouldNotReachHere();
4168   InstructionMark im(this);
4169   int encode = prefixq_and_encode(dst->encoding());
4170   emit_byte(0xC7 | encode);
4171   emit_long(imm32);
4172 }
4173 
4174 void Assembler::movslq(Address dst, int32_t imm32) {
4175   assert(is_simm32(imm32), "lost bits");
4176   InstructionMark im(this);
4177   prefixq(dst);
4178   emit_byte(0xC7);
4179   emit_operand(rax, dst, 4);
4180   emit_long(imm32);
4181 }
4182 
4183 void Assembler::movslq(Register dst, Address src) {
4184   InstructionMark im(this);
4185   prefixq(src, dst);
4186   emit_byte(0x63);
4187   emit_operand(dst, src);
4188 }
4189 
4190 void Assembler::movslq(Register dst, Register src) {
4191   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4192   emit_byte(0x63);
4193   emit_byte(0xC0 | encode);
4194 }
4195 
4196 void Assembler::movswq(Register dst, Address src) {
4197   InstructionMark im(this);
4198   prefixq(src, dst);
4199   emit_byte(0x0F);
4200   emit_byte(0xBF);
4201   emit_operand(dst, src);
4202 }
4203 
4204 void Assembler::movswq(Register dst, Register src) {
4205   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4206   emit_byte(0x0F);
4207   emit_byte(0xBF);
4208   emit_byte(0xC0 | encode);
4209 }
4210 
4211 void Assembler::movzbq(Register dst, Address src) {
4212   InstructionMark im(this);
4213   prefixq(src, dst);
4214   emit_byte(0x0F);
4215   emit_byte(0xB6);
4216   emit_operand(dst, src);
4217 }
4218 
4219 void Assembler::movzbq(Register dst, Register src) {
4220   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4221   emit_byte(0x0F);
4222   emit_byte(0xB6);
4223   emit_byte(0xC0 | encode);
4224 }
4225 
4226 void Assembler::movzwq(Register dst, Address src) {
4227   InstructionMark im(this);
4228   prefixq(src, dst);
4229   emit_byte(0x0F);
4230   emit_byte(0xB7);
4231   emit_operand(dst, src);
4232 }
4233 
4234 void Assembler::movzwq(Register dst, Register src) {
4235   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4236   emit_byte(0x0F);
4237   emit_byte(0xB7);
4238   emit_byte(0xC0 | encode);
4239 }
4240 
4241 void Assembler::negq(Register dst) {
4242   int encode = prefixq_and_encode(dst->encoding());
4243   emit_byte(0xF7);
4244   emit_byte(0xD8 | encode);
4245 }
4246 
4247 void Assembler::notq(Register dst) {
4248   int encode = prefixq_and_encode(dst->encoding());
4249   emit_byte(0xF7);
4250   emit_byte(0xD0 | encode);
4251 }
4252 
4253 void Assembler::orq(Address dst, int32_t imm32) {
4254   InstructionMark im(this);
4255   prefixq(dst);
4256   emit_byte(0x81);
4257   emit_operand(rcx, dst, 4);
4258   emit_long(imm32);
4259 }
4260 
4261 void Assembler::orq(Register dst, int32_t imm32) {
4262   (void) prefixq_and_encode(dst->encoding());
4263   emit_arith(0x81, 0xC8, dst, imm32);
4264 }
4265 
4266 void Assembler::orq(Register dst, Address src) {
4267   InstructionMark im(this);
4268   prefixq(src, dst);
4269   emit_byte(0x0B);
4270   emit_operand(dst, src);
4271 }
4272 
4273 void Assembler::orq(Register dst, Register src) {
4274   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4275   emit_arith(0x0B, 0xC0, dst, src);
4276 }
4277 
4278 void Assembler::popa() { // 64bit
4279   movq(r15, Address(rsp, 0));
4280   movq(r14, Address(rsp, wordSize));
4281   movq(r13, Address(rsp, 2 * wordSize));
4282   movq(r12, Address(rsp, 3 * wordSize));
4283   movq(r11, Address(rsp, 4 * wordSize));
4284   movq(r10, Address(rsp, 5 * wordSize));
4285   movq(r9,  Address(rsp, 6 * wordSize));
4286   movq(r8,  Address(rsp, 7 * wordSize));
4287   movq(rdi, Address(rsp, 8 * wordSize));
4288   movq(rsi, Address(rsp, 9 * wordSize));
4289   movq(rbp, Address(rsp, 10 * wordSize));
4290   // skip rsp
4291   movq(rbx, Address(rsp, 12 * wordSize));
4292   movq(rdx, Address(rsp, 13 * wordSize));
4293   movq(rcx, Address(rsp, 14 * wordSize));
4294   movq(rax, Address(rsp, 15 * wordSize));
4295 
4296   addq(rsp, 16 * wordSize);
4297 }
4298 
4299 void Assembler::popcntq(Register dst, Address src) {
4300   assert(VM_Version::supports_popcnt(), "must support");
4301   InstructionMark im(this);
4302   emit_byte(0xF3);
4303   prefixq(src, dst);
4304   emit_byte(0x0F);
4305   emit_byte(0xB8);
4306   emit_operand(dst, src);
4307 }
4308 
4309 void Assembler::popcntq(Register dst, Register src) {
4310   assert(VM_Version::supports_popcnt(), "must support");
4311   emit_byte(0xF3);
4312   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4313   emit_byte(0x0F);
4314   emit_byte(0xB8);
4315   emit_byte(0xC0 | encode);
4316 }
4317 
4318 void Assembler::popq(Address dst) {
4319   InstructionMark im(this);
4320   prefixq(dst);
4321   emit_byte(0x8F);
4322   emit_operand(rax, dst);
4323 }
4324 
4325 void Assembler::pusha() { // 64bit
4326   // we have to store original rsp.  ABI says that 128 bytes
4327   // below rsp are local scratch.
4328   movq(Address(rsp, -5 * wordSize), rsp);
4329 
4330   subq(rsp, 16 * wordSize);
4331 
4332   movq(Address(rsp, 15 * wordSize), rax);
4333   movq(Address(rsp, 14 * wordSize), rcx);
4334   movq(Address(rsp, 13 * wordSize), rdx);
4335   movq(Address(rsp, 12 * wordSize), rbx);
4336   // skip rsp
4337   movq(Address(rsp, 10 * wordSize), rbp);
4338   movq(Address(rsp, 9 * wordSize), rsi);
4339   movq(Address(rsp, 8 * wordSize), rdi);
4340   movq(Address(rsp, 7 * wordSize), r8);
4341   movq(Address(rsp, 6 * wordSize), r9);
4342   movq(Address(rsp, 5 * wordSize), r10);
4343   movq(Address(rsp, 4 * wordSize), r11);
4344   movq(Address(rsp, 3 * wordSize), r12);
4345   movq(Address(rsp, 2 * wordSize), r13);
4346   movq(Address(rsp, wordSize), r14);
4347   movq(Address(rsp, 0), r15);
4348 }
4349 
4350 void Assembler::pushq(Address src) {
4351   InstructionMark im(this);
4352   prefixq(src);
4353   emit_byte(0xFF);
4354   emit_operand(rsi, src);
4355 }
4356 
4357 void Assembler::rclq(Register dst, int imm8) {
4358   assert(isShiftCount(imm8 >> 1), "illegal shift count");
4359   int encode = prefixq_and_encode(dst->encoding());
4360   if (imm8 == 1) {
4361     emit_byte(0xD1);
4362     emit_byte(0xD0 | encode);
4363   } else {
4364     emit_byte(0xC1);
4365     emit_byte(0xD0 | encode);
4366     emit_byte(imm8);
4367   }
4368 }
4369 void Assembler::sarq(Register dst, int imm8) {
4370   assert(isShiftCount(imm8 >> 1), "illegal shift count");
4371   int encode = prefixq_and_encode(dst->encoding());
4372   if (imm8 == 1) {
4373     emit_byte(0xD1);
4374     emit_byte(0xF8 | encode);
4375   } else {
4376     emit_byte(0xC1);
4377     emit_byte(0xF8 | encode);
4378     emit_byte(imm8);
4379   }
4380 }
4381 
4382 void Assembler::sarq(Register dst) {
4383   int encode = prefixq_and_encode(dst->encoding());
4384   emit_byte(0xD3);
4385   emit_byte(0xF8 | encode);
4386 }
4387 
4388 void Assembler::sbbq(Address dst, int32_t imm32) {
4389   InstructionMark im(this);
4390   prefixq(dst);
4391   emit_arith_operand(0x81, rbx, dst, imm32);
4392 }
4393 
4394 void Assembler::sbbq(Register dst, int32_t imm32) {
4395   (void) prefixq_and_encode(dst->encoding());
4396   emit_arith(0x81, 0xD8, dst, imm32);
4397 }
4398 
4399 void Assembler::sbbq(Register dst, Address src) {
4400   InstructionMark im(this);
4401   prefixq(src, dst);
4402   emit_byte(0x1B);
4403   emit_operand(dst, src);
4404 }
4405 
4406 void Assembler::sbbq(Register dst, Register src) {
4407   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4408   emit_arith(0x1B, 0xC0, dst, src);
4409 }
4410 
4411 void Assembler::shlq(Register dst, int imm8) {
4412   assert(isShiftCount(imm8 >> 1), "illegal shift count");
4413   int encode = prefixq_and_encode(dst->encoding());
4414   if (imm8 == 1) {
4415     emit_byte(0xD1);
4416     emit_byte(0xE0 | encode);
4417   } else {
4418     emit_byte(0xC1);
4419     emit_byte(0xE0 | encode);
4420     emit_byte(imm8);
4421   }
4422 }
4423 
4424 void Assembler::shlq(Register dst) {
4425   int encode = prefixq_and_encode(dst->encoding());
4426   emit_byte(0xD3);
4427   emit_byte(0xE0 | encode);
4428 }
4429 
4430 void Assembler::shrq(Register dst, int imm8) {
4431   assert(isShiftCount(imm8 >> 1), "illegal shift count");
4432   int encode = prefixq_and_encode(dst->encoding());
4433   emit_byte(0xC1);
4434   emit_byte(0xE8 | encode);
4435   emit_byte(imm8);
4436 }
4437 
4438 void Assembler::shrq(Register dst) {
4439   int encode = prefixq_and_encode(dst->encoding());
4440   emit_byte(0xD3);
4441   emit_byte(0xE8 | encode);
4442 }
4443 
4444 void Assembler::subq(Address dst, int32_t imm32) {
4445   InstructionMark im(this);
4446   prefixq(dst);
4447   emit_arith_operand(0x81, rbp, dst, imm32);
4448 }
4449 
4450 void Assembler::subq(Address dst, Register src) {
4451   InstructionMark im(this);
4452   prefixq(dst, src);
4453   emit_byte(0x29);
4454   emit_operand(src, dst);
4455 }
4456 
4457 void Assembler::subq(Register dst, int32_t imm32) {
4458   (void) prefixq_and_encode(dst->encoding());
4459   emit_arith(0x81, 0xE8, dst, imm32);
4460 }
4461 
4462 void Assembler::subq(Register dst, Address src) {
4463   InstructionMark im(this);
4464   prefixq(src, dst);
4465   emit_byte(0x2B);
4466   emit_operand(dst, src);
4467 }
4468 
4469 void Assembler::subq(Register dst, Register src) {
4470   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4471   emit_arith(0x2B, 0xC0, dst, src);
4472 }
4473 
4474 void Assembler::testq(Register dst, int32_t imm32) {
4475   // not using emit_arith because test
4476   // doesn't support sign-extension of
4477   // 8bit operands
4478   int encode = dst->encoding();
4479   if (encode == 0) {
4480     prefix(REX_W);
4481     emit_byte(0xA9);
4482   } else {
4483     encode = prefixq_and_encode(encode);
4484     emit_byte(0xF7);
4485     emit_byte(0xC0 | encode);
4486   }
4487   emit_long(imm32);
4488 }
4489 
4490 void Assembler::testq(Register dst, Register src) {
4491   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4492   emit_arith(0x85, 0xC0, dst, src);
4493 }
4494 
4495 void Assembler::xaddq(Address dst, Register src) {
4496   InstructionMark im(this);
4497   prefixq(dst, src);
4498   emit_byte(0x0F);
4499   emit_byte(0xC1);
4500   emit_operand(src, dst);
4501 }
4502 
4503 void Assembler::xchgq(Register dst, Address src) {
4504   InstructionMark im(this);
4505   prefixq(src, dst);
4506   emit_byte(0x87);
4507   emit_operand(dst, src);
4508 }
4509 
4510 void Assembler::xchgq(Register dst, Register src) {
4511   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4512   emit_byte(0x87);
4513   emit_byte(0xc0 | encode);
4514 }
4515 
4516 void Assembler::xorq(Register dst, Register src) {
4517   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4518   emit_arith(0x33, 0xC0, dst, src);
4519 }
4520 
4521 void Assembler::xorq(Register dst, Address src) {
4522   InstructionMark im(this);
4523   prefixq(src, dst);
4524   emit_byte(0x33);
4525   emit_operand(dst, src);
4526 }
4527 
4528 #endif // !LP64
4529 
4530 static Assembler::Condition reverse[] = {
4531     Assembler::noOverflow     /* overflow      = 0x0 */ ,
4532     Assembler::overflow       /* noOverflow    = 0x1 */ ,
4533     Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
4534     Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
4535     Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
4536     Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
4537     Assembler::above          /* belowEqual    = 0x6 */ ,
4538     Assembler::belowEqual     /* above         = 0x7 */ ,
4539     Assembler::positive       /* negative      = 0x8 */ ,
4540     Assembler::negative       /* positive      = 0x9 */ ,
4541     Assembler::noParity       /* parity        = 0xa */ ,
4542     Assembler::parity         /* noParity      = 0xb */ ,
4543     Assembler::greaterEqual   /* less          = 0xc */ ,
4544     Assembler::less           /* greaterEqual  = 0xd */ ,
4545     Assembler::greater        /* lessEqual     = 0xe */ ,
4546     Assembler::lessEqual      /* greater       = 0xf, */
4547 
4548 };
4549 
4550 
4551 // Implementation of MacroAssembler
4552 
// First come the routines whose implementations differ between the 32-bit
// and 64-bit builds, unless the difference is trivial (one line or so).
4555 
4556 #ifndef _LP64
4557 
4558 // 32bit versions
4559 
4560 Address MacroAssembler::as_Address(AddressLiteral adr) {
4561   return Address(adr.target(), adr.rspec());
4562 }
4563 
4564 Address MacroAssembler::as_Address(ArrayAddress adr) {
4565   return Address::make_array(adr);
4566 }
4567 
4568 int MacroAssembler::biased_locking_enter(Register lock_reg,
4569                                          Register obj_reg,
4570                                          Register swap_reg,
4571                                          Register tmp_reg,
4572                                          bool swap_reg_contains_mark,
4573                                          Label& done,
4574                                          Label* slow_case,
4575                                          BiasedLockingCounters* counters) {
4576   assert(UseBiasedLocking, "why call this otherwise?");
4577   assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
4578   assert_different_registers(lock_reg, obj_reg, swap_reg);
4579 
4580   if (PrintBiasedLockingStatistics && counters == NULL)
4581     counters = BiasedLocking::counters();
4582 
4583   bool need_tmp_reg = false;
4584   if (tmp_reg == noreg) {
4585     need_tmp_reg = true;
4586     tmp_reg = lock_reg;
4587   } else {
4588     assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
4589   }
4590   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
4591   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
4592   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
4593   Address saved_mark_addr(lock_reg, 0);
4594 
4595   // Biased locking
4596   // See whether the lock is currently biased toward our thread and
4597   // whether the epoch is still valid
4598   // Note that the runtime guarantees sufficient alignment of JavaThread
4599   // pointers to allow age to be placed into low bits
4600   // First check to see whether biasing is even enabled for this object
4601   Label cas_label;
4602   int null_check_offset = -1;
4603   if (!swap_reg_contains_mark) {
4604     null_check_offset = offset();
4605     movl(swap_reg, mark_addr);
4606   }
4607   if (need_tmp_reg) {
4608     push(tmp_reg);
4609   }
4610   movl(tmp_reg, swap_reg);
4611   andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
4612   cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
4613   if (need_tmp_reg) {
4614     pop(tmp_reg);
4615   }
4616   jcc(Assembler::notEqual, cas_label);
4617   // The bias pattern is present in the object's header. Need to check
4618   // whether the bias owner and the epoch are both still current.
4619   // Note that because there is no current thread register on x86 we
4620   // need to store off the mark word we read out of the object to
4621   // avoid reloading it and needing to recheck invariants below. This
4622   // store is unfortunate but it makes the overall code shorter and
4623   // simpler.
4624   movl(saved_mark_addr, swap_reg);
4625   if (need_tmp_reg) {
4626     push(tmp_reg);
4627   }
4628   get_thread(tmp_reg);
4629   xorl(swap_reg, tmp_reg);
4630   if (swap_reg_contains_mark) {
4631     null_check_offset = offset();
4632   }
4633   movl(tmp_reg, klass_addr);
4634   xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
4635   andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
4636   if (need_tmp_reg) {
4637     pop(tmp_reg);
4638   }
4639   if (counters != NULL) {
4640     cond_inc32(Assembler::zero,
4641                ExternalAddress((address)counters->biased_lock_entry_count_addr()));
4642   }
4643   jcc(Assembler::equal, done);
4644 
4645   Label try_revoke_bias;
4646   Label try_rebias;
4647 
4648   // At this point we know that the header has the bias pattern and
4649   // that we are not the bias owner in the current epoch. We need to
4650   // figure out more details about the state of the header in order to
4651   // know what operations can be legally performed on the object's
4652   // header.
4653 
4654   // If the low three bits in the xor result aren't clear, that means
4655   // the prototype header is no longer biased and we have to revoke
4656   // the bias on this object.
4657   testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
4658   jcc(Assembler::notZero, try_revoke_bias);
4659 
4660   // Biasing is still enabled for this data type. See whether the
4661   // epoch of the current bias is still valid, meaning that the epoch
4662   // bits of the mark word are equal to the epoch bits of the
4663   // prototype header. (Note that the prototype header's epoch bits
4664   // only change at a safepoint.) If not, attempt to rebias the object
4665   // toward the current thread. Note that we must be absolutely sure
4666   // that the current epoch is invalid in order to do this because
4667   // otherwise the manipulations it performs on the mark word are
4668   // illegal.
4669   testl(swap_reg, markOopDesc::epoch_mask_in_place);
4670   jcc(Assembler::notZero, try_rebias);
4671 
4672   // The epoch of the current bias is still valid but we know nothing
4673   // about the owner; it might be set or it might be clear. Try to
4674   // acquire the bias of the object using an atomic operation. If this
4675   // fails we will go in to the runtime to revoke the object's bias.
4676   // Note that we first construct the presumed unbiased header so we
4677   // don't accidentally blow away another thread's valid bias.
4678   movl(swap_reg, saved_mark_addr);
4679   andl(swap_reg,
4680        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
4681   if (need_tmp_reg) {
4682     push(tmp_reg);
4683   }
4684   get_thread(tmp_reg);
4685   orl(tmp_reg, swap_reg);
4686   if (os::is_MP()) {
4687     lock();
4688   }
4689   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
4690   if (need_tmp_reg) {
4691     pop(tmp_reg);
4692   }
4693   // If the biasing toward our thread failed, this means that
4694   // another thread succeeded in biasing it toward itself and we
4695   // need to revoke that bias. The revocation will occur in the
4696   // interpreter runtime in the slow case.
4697   if (counters != NULL) {
4698     cond_inc32(Assembler::zero,
4699                ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
4700   }
4701   if (slow_case != NULL) {
4702     jcc(Assembler::notZero, *slow_case);
4703   }
4704   jmp(done);
4705 
4706   bind(try_rebias);
4707   // At this point we know the epoch has expired, meaning that the
4708   // current "bias owner", if any, is actually invalid. Under these
4709   // circumstances _only_, we are allowed to use the current header's
4710   // value as the comparison value when doing the cas to acquire the
4711   // bias in the current epoch. In other words, we allow transfer of
4712   // the bias from one thread to another directly in this situation.
4713   //
4714   // FIXME: due to a lack of registers we currently blow away the age
4715   // bits in this situation. Should attempt to preserve them.
4716   if (need_tmp_reg) {
4717     push(tmp_reg);
4718   }
4719   get_thread(tmp_reg);
4720   movl(swap_reg, klass_addr);
4721   orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
4722   movl(swap_reg, saved_mark_addr);
4723   if (os::is_MP()) {
4724     lock();
4725   }
4726   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
4727   if (need_tmp_reg) {
4728     pop(tmp_reg);
4729   }
4730   // If the biasing toward our thread failed, then another thread
4731   // succeeded in biasing it toward itself and we need to revoke that
4732   // bias. The revocation will occur in the runtime in the slow case.
4733   if (counters != NULL) {
4734     cond_inc32(Assembler::zero,
4735                ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
4736   }
4737   if (slow_case != NULL) {
4738     jcc(Assembler::notZero, *slow_case);
4739   }
4740   jmp(done);
4741 
4742   bind(try_revoke_bias);
4743   // The prototype mark in the klass doesn't have the bias bit set any
4744   // more, indicating that objects of this data type are not supposed
4745   // to be biased any more. We are going to try to reset the mark of
4746   // this object to the prototype value and fall through to the
4747   // CAS-based locking scheme. Note that if our CAS fails, it means
4748   // that another thread raced us for the privilege of revoking the
4749   // bias of this particular object, so it's okay to continue in the
4750   // normal locking code.
4751   //
4752   // FIXME: due to a lack of registers we currently blow away the age
4753   // bits in this situation. Should attempt to preserve them.
4754   movl(swap_reg, saved_mark_addr);
4755   if (need_tmp_reg) {
4756     push(tmp_reg);
4757   }
4758   movl(tmp_reg, klass_addr);
4759   movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
4760   if (os::is_MP()) {
4761     lock();
4762   }
4763   cmpxchgptr(tmp_reg, Address(obj_reg, 0));
4764   if (need_tmp_reg) {
4765     pop(tmp_reg);
4766   }
4767   // Fall through to the normal CAS-based lock, because no matter what
4768   // the result of the above CAS, some thread must have succeeded in
4769   // removing the bias bit from the object's header.
4770   if (counters != NULL) {
4771     cond_inc32(Assembler::zero,
4772                ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
4773   }
4774 
4775   bind(cas_label);
4776 
4777   return null_check_offset;
4778 }
4779 void MacroAssembler::call_VM_leaf_base(address entry_point,
4780                                        int number_of_arguments) {
4781   call(RuntimeAddress(entry_point));
4782   increment(rsp, number_of_arguments * wordSize);
4783 }
4784 
4785 void MacroAssembler::cmpoop(Address src1, jobject obj) {
4786   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
4787 }
4788 
4789 void MacroAssembler::cmpoop(Register src1, jobject obj) {
4790   cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
4791 }
4792 
4793 void MacroAssembler::extend_sign(Register hi, Register lo) {
4794   // According to Intel Doc. AP-526, "Integer Divide", p.18.
4795   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
4796     cdql();
4797   } else {
4798     movl(hi, lo);
4799     sarl(hi, 31);
4800   }
4801 }
4802 
4803 void MacroAssembler::fat_nop() {
4804   // A 5 byte nop that is safe for patching (see patch_verified_entry)
4805   emit_byte(0x26); // es:
4806   emit_byte(0x2e); // cs:
4807   emit_byte(0x64); // fs:
4808   emit_byte(0x65); // gs:
4809   emit_byte(0x90);
4810 }
4811 
4812 void MacroAssembler::jC2(Register tmp, Label& L) {
4813   // set parity bit if FPU flag C2 is set (via rax)
4814   save_rax(tmp);
4815   fwait(); fnstsw_ax();
4816   sahf();
4817   restore_rax(tmp);
4818   // branch
4819   jcc(Assembler::parity, L);
4820 }
4821 
4822 void MacroAssembler::jnC2(Register tmp, Label& L) {
4823   // set parity bit if FPU flag C2 is set (via rax)
4824   save_rax(tmp);
4825   fwait(); fnstsw_ax();
4826   sahf();
4827   restore_rax(tmp);
4828   // branch
4829   jcc(Assembler::noParity, L);
4830 }
4831 
4832 // 32bit can do a case table jump in one instruction but we no longer allow the base
4833 // to be installed in the Address class
4834 void MacroAssembler::jump(ArrayAddress entry) {
4835   jmp(as_Address(entry));
4836 }
4837 
4838 // Note: y_lo will be destroyed
4839 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
4840   // Long compare for Java (semantics as described in JVM spec.)
4841   Label high, low, done;
4842 
4843   cmpl(x_hi, y_hi);
4844   jcc(Assembler::less, low);
4845   jcc(Assembler::greater, high);
4846   // x_hi is the return register
4847   xorl(x_hi, x_hi);
4848   cmpl(x_lo, y_lo);
4849   jcc(Assembler::below, low);
4850   jcc(Assembler::equal, done);
4851 
4852   bind(high);
4853   xorl(x_hi, x_hi);
4854   increment(x_hi);
4855   jmp(done);
4856 
4857   bind(low);
4858   xorl(x_hi, x_hi);
4859   decrementl(x_hi);
4860 
4861   bind(done);
4862 }
4863 
4864 void MacroAssembler::lea(Register dst, AddressLiteral src) {
4865     mov_literal32(dst, (int32_t)src.target(), src.rspec());
4866 }
4867 
4868 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
4869   // leal(dst, as_Address(adr));
4870   // see note in movl as to why we must use a move
4871   mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
4872 }
4873 
4874 void MacroAssembler::leave() {
4875   mov(rsp, rbp);
4876   pop(rbp);
4877 }
4878 
4879 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
4880   // Multiplication of two Java long values stored on the stack
4881   // as illustrated below. Result is in rdx:rax.
4882   //
4883   // rsp ---> [  ??  ] \               \
4884   //            ....    | y_rsp_offset  |
4885   //          [ y_lo ] /  (in bytes)    | x_rsp_offset
4886   //          [ y_hi ]                  | (in bytes)
4887   //            ....                    |
4888   //          [ x_lo ]                 /
4889   //          [ x_hi ]
4890   //            ....
4891   //
4892   // Basic idea: lo(result) = lo(x_lo * y_lo)
4893   //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
4894   Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
4895   Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
4896   Label quick;
4897   // load x_hi, y_hi and check if quick
4898   // multiplication is possible
4899   movl(rbx, x_hi);
4900   movl(rcx, y_hi);
4901   movl(rax, rbx);
4902   orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
4903   jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
4904   // do full multiplication
4905   // 1st step
4906   mull(y_lo);                                    // x_hi * y_lo
4907   movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
4908   // 2nd step
4909   movl(rax, x_lo);
4910   mull(rcx);                                     // x_lo * y_hi
4911   addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
4912   // 3rd step
4913   bind(quick);                                   // note: rbx, = 0 if quick multiply!
4914   movl(rax, x_lo);
4915   mull(y_lo);                                    // x_lo * y_lo
4916   addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
4917 }
4918 
4919 void MacroAssembler::lneg(Register hi, Register lo) {
4920   negl(lo);
4921   adcl(hi, 0);
4922   negl(hi);
4923 }
4924 
4925 void MacroAssembler::lshl(Register hi, Register lo) {
4926   // Java shift left long support (semantics as described in JVM spec., p.305)
4927   // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
4928   // shift value is in rcx !
4929   assert(hi != rcx, "must not use rcx");
4930   assert(lo != rcx, "must not use rcx");
4931   const Register s = rcx;                        // shift count
4932   const int      n = BitsPerWord;
4933   Label L;
4934   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
4935   cmpl(s, n);                                    // if (s < n)
4936   jcc(Assembler::less, L);                       // else (s >= n)
4937   movl(hi, lo);                                  // x := x << n
4938   xorl(lo, lo);
4939   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
4940   bind(L);                                       // s (mod n) < n
4941   shldl(hi, lo);                                 // x := x << s
4942   shll(lo);
4943 }
4944 
4945 
4946 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
4947   // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
4948   // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
4949   assert(hi != rcx, "must not use rcx");
4950   assert(lo != rcx, "must not use rcx");
4951   const Register s = rcx;                        // shift count
4952   const int      n = BitsPerWord;
4953   Label L;
4954   andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
4955   cmpl(s, n);                                    // if (s < n)
4956   jcc(Assembler::less, L);                       // else (s >= n)
4957   movl(lo, hi);                                  // x := x >> n
4958   if (sign_extension) sarl(hi, 31);
4959   else                xorl(hi, hi);
4960   // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
4961   bind(L);                                       // s (mod n) < n
4962   shrdl(lo, hi);                                 // x := x >> s
4963   if (sign_extension) sarl(hi);
4964   else                shrl(hi);
4965 }
4966 
4967 void MacroAssembler::movoop(Register dst, jobject obj) {
4968   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
4969 }
4970 
4971 void MacroAssembler::movoop(Address dst, jobject obj) {
4972   mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
4973 }
4974 
4975 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
4976   if (src.is_lval()) {
4977     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
4978   } else {
4979     movl(dst, as_Address(src));
4980   }
4981 }
4982 
4983 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
4984   movl(as_Address(dst), src);
4985 }
4986 
4987 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
4988   movl(dst, as_Address(src));
4989 }
4990 
4991 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
4992 void MacroAssembler::movptr(Address dst, intptr_t src) {
4993   movl(dst, src);
4994 }
4995 
4996 
4997 void MacroAssembler::pop_callee_saved_registers() {
4998   pop(rcx);
4999   pop(rdx);
5000   pop(rdi);
5001   pop(rsi);
5002 }
5003 
5004 void MacroAssembler::pop_fTOS() {
5005   fld_d(Address(rsp, 0));
5006   addl(rsp, 2 * wordSize);
5007 }
5008 
5009 void MacroAssembler::push_callee_saved_registers() {
5010   push(rsi);
5011   push(rdi);
5012   push(rdx);
5013   push(rcx);
5014 }
5015 
5016 void MacroAssembler::push_fTOS() {
5017   subl(rsp, 2 * wordSize);
5018   fstp_d(Address(rsp, 0));
5019 }
5020 
5021 
5022 void MacroAssembler::pushoop(jobject obj) {
5023   push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
5024 }
5025 
5026 
5027 void MacroAssembler::pushptr(AddressLiteral src) {
5028   if (src.is_lval()) {
5029     push_literal32((int32_t)src.target(), src.rspec());
5030   } else {
5031     pushl(as_Address(src));
5032   }
5033 }
5034 
5035 void MacroAssembler::set_word_if_not_zero(Register dst) {
5036   xorl(dst, dst);
5037   set_byte_if_not_zero(dst);
5038 }
5039 
5040 static void pass_arg0(MacroAssembler* masm, Register arg) {
5041   masm->push(arg);
5042 }
5043 
5044 static void pass_arg1(MacroAssembler* masm, Register arg) {
5045   masm->push(arg);
5046 }
5047 
5048 static void pass_arg2(MacroAssembler* masm, Register arg) {
5049   masm->push(arg);
5050 }
5051 
5052 static void pass_arg3(MacroAssembler* masm, Register arg) {
5053   masm->push(arg);
5054 }
5055 
5056 #ifndef PRODUCT
5057 extern "C" void findpc(intptr_t x);
5058 #endif
5059 
5060 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
5061   // In order to get locks to work, we need to fake a in_VM state
5062   JavaThread* thread = JavaThread::current();
5063   JavaThreadState saved_state = thread->thread_state();
5064   thread->set_thread_state(_thread_in_vm);
5065   if (ShowMessageBoxOnError) {
5066     JavaThread* thread = JavaThread::current();
5067     JavaThreadState saved_state = thread->thread_state();
5068     thread->set_thread_state(_thread_in_vm);
5069     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
5070       ttyLocker ttyl;
5071       BytecodeCounter::print();
5072     }
5073     // To see where a verify_oop failed, get $ebx+40/X for this frame.
5074     // This is the value of eip which points to where verify_oop will return.
5075     if (os::message_box(msg, "Execution stopped, print registers?")) {
5076       ttyLocker ttyl;
5077       tty->print_cr("eip = 0x%08x", eip);
5078 #ifndef PRODUCT
5079       if ((WizardMode || Verbose) && PrintMiscellaneous) {
5080         tty->cr();
5081         findpc(eip);
5082         tty->cr();
5083       }
5084 #endif
5085       tty->print_cr("rax = 0x%08x", rax);
5086       tty->print_cr("rbx = 0x%08x", rbx);
5087       tty->print_cr("rcx = 0x%08x", rcx);
5088       tty->print_cr("rdx = 0x%08x", rdx);
5089       tty->print_cr("rdi = 0x%08x", rdi);
5090       tty->print_cr("rsi = 0x%08x", rsi);
5091       tty->print_cr("rbp = 0x%08x", rbp);
5092       tty->print_cr("rsp = 0x%08x", rsp);
5093       BREAKPOINT;
5094       assert(false, "start up GDB");
5095     }
5096   } else {
5097     ttyLocker ttyl;
5098     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
5099     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
5100   }
5101   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
5102 }
5103 
5104 void MacroAssembler::stop(const char* msg) {
5105   ExternalAddress message((address)msg);
5106   // push address of message
5107   pushptr(message.addr());
5108   { Label L; call(L, relocInfo::none); bind(L); }     // push eip
5109   pusha();                                           // push registers
5110   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
5111   hlt();
5112 }
5113 
5114 void MacroAssembler::warn(const char* msg) {
5115   push_CPU_state();
5116 
5117   ExternalAddress message((address) msg);
5118   // push address of message
5119   pushptr(message.addr());
5120 
5121   call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
5122   addl(rsp, wordSize);       // discard argument
5123   pop_CPU_state();
5124 }
5125 
5126 #else // _LP64
5127 
5128 // 64 bit versions
5129 
5130 Address MacroAssembler::as_Address(AddressLiteral adr) {
5131   // amd64 always does this as a pc-rel
5132   // we can be absolute or disp based on the instruction type
5133   // jmp/call are displacements others are absolute
5134   assert(!adr.is_lval(), "must be rval");
5135   assert(reachable(adr), "must be");
5136   return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
5137 
5138 }
5139 
5140 Address MacroAssembler::as_Address(ArrayAddress adr) {
5141   AddressLiteral base = adr.base();
5142   lea(rscratch1, base);
5143   Address index = adr.index();
5144   assert(index._disp == 0, "must not have disp"); // maybe it can?
5145   Address array(rscratch1, index._index, index._scale, index._disp);
5146   return array;
5147 }
5148 
5149 int MacroAssembler::biased_locking_enter(Register lock_reg,
5150                                          Register obj_reg,
5151                                          Register swap_reg,
5152                                          Register tmp_reg,
5153                                          bool swap_reg_contains_mark,
5154                                          Label& done,
5155                                          Label* slow_case,
5156                                          BiasedLockingCounters* counters) {
5157   assert(UseBiasedLocking, "why call this otherwise?");
5158   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
5159   assert(tmp_reg != noreg, "tmp_reg must be supplied");
5160   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
5161   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
5162   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
5163   Address saved_mark_addr(lock_reg, 0);
5164 
5165   if (PrintBiasedLockingStatistics && counters == NULL)
5166     counters = BiasedLocking::counters();
5167 
5168   // Biased locking
5169   // See whether the lock is currently biased toward our thread and
5170   // whether the epoch is still valid
5171   // Note that the runtime guarantees sufficient alignment of JavaThread
5172   // pointers to allow age to be placed into low bits
5173   // First check to see whether biasing is even enabled for this object
5174   Label cas_label;
5175   int null_check_offset = -1;
5176   if (!swap_reg_contains_mark) {
5177     null_check_offset = offset();
5178     movq(swap_reg, mark_addr);
5179   }
5180   movq(tmp_reg, swap_reg);
5181   andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5182   cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
5183   jcc(Assembler::notEqual, cas_label);
5184   // The bias pattern is present in the object's header. Need to check
5185   // whether the bias owner and the epoch are both still current.
5186   load_prototype_header(tmp_reg, obj_reg);
5187   orq(tmp_reg, r15_thread);
5188   xorq(tmp_reg, swap_reg);
5189   andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
5190   if (counters != NULL) {
5191     cond_inc32(Assembler::zero,
5192                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5193   }
5194   jcc(Assembler::equal, done);
5195 
5196   Label try_revoke_bias;
5197   Label try_rebias;
5198 
5199   // At this point we know that the header has the bias pattern and
5200   // that we are not the bias owner in the current epoch. We need to
5201   // figure out more details about the state of the header in order to
5202   // know what operations can be legally performed on the object's
5203   // header.
5204 
5205   // If the low three bits in the xor result aren't clear, that means
5206   // the prototype header is no longer biased and we have to revoke
5207   // the bias on this object.
5208   testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5209   jcc(Assembler::notZero, try_revoke_bias);
5210 
5211   // Biasing is still enabled for this data type. See whether the
5212   // epoch of the current bias is still valid, meaning that the epoch
5213   // bits of the mark word are equal to the epoch bits of the
5214   // prototype header. (Note that the prototype header's epoch bits
5215   // only change at a safepoint.) If not, attempt to rebias the object
5216   // toward the current thread. Note that we must be absolutely sure
5217   // that the current epoch is invalid in order to do this because
5218   // otherwise the manipulations it performs on the mark word are
5219   // illegal.
5220   testq(tmp_reg, markOopDesc::epoch_mask_in_place);
5221   jcc(Assembler::notZero, try_rebias);
5222 
5223   // The epoch of the current bias is still valid but we know nothing
5224   // about the owner; it might be set or it might be clear. Try to
5225   // acquire the bias of the object using an atomic operation. If this
5226   // fails we will go in to the runtime to revoke the object's bias.
5227   // Note that we first construct the presumed unbiased header so we
5228   // don't accidentally blow away another thread's valid bias.
5229   andq(swap_reg,
5230        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
5231   movq(tmp_reg, swap_reg);
5232   orq(tmp_reg, r15_thread);
5233   if (os::is_MP()) {
5234     lock();
5235   }
5236   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5237   // If the biasing toward our thread failed, this means that
5238   // another thread succeeded in biasing it toward itself and we
5239   // need to revoke that bias. The revocation will occur in the
5240   // interpreter runtime in the slow case.
5241   if (counters != NULL) {
5242     cond_inc32(Assembler::zero,
5243                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5244   }
5245   if (slow_case != NULL) {
5246     jcc(Assembler::notZero, *slow_case);
5247   }
5248   jmp(done);
5249 
5250   bind(try_rebias);
5251   // At this point we know the epoch has expired, meaning that the
5252   // current "bias owner", if any, is actually invalid. Under these
5253   // circumstances _only_, we are allowed to use the current header's
5254   // value as the comparison value when doing the cas to acquire the
5255   // bias in the current epoch. In other words, we allow transfer of
5256   // the bias from one thread to another directly in this situation.
5257   //
5258   // FIXME: due to a lack of registers we currently blow away the age
5259   // bits in this situation. Should attempt to preserve them.
5260   load_prototype_header(tmp_reg, obj_reg);
5261   orq(tmp_reg, r15_thread);
5262   if (os::is_MP()) {
5263     lock();
5264   }
5265   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5266   // If the biasing toward our thread failed, then another thread
5267   // succeeded in biasing it toward itself and we need to revoke that
5268   // bias. The revocation will occur in the runtime in the slow case.
5269   if (counters != NULL) {
5270     cond_inc32(Assembler::zero,
5271                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
5272   }
5273   if (slow_case != NULL) {
5274     jcc(Assembler::notZero, *slow_case);
5275   }
5276   jmp(done);
5277 
5278   bind(try_revoke_bias);
5279   // The prototype mark in the klass doesn't have the bias bit set any
5280   // more, indicating that objects of this data type are not supposed
5281   // to be biased any more. We are going to try to reset the mark of
5282   // this object to the prototype value and fall through to the
5283   // CAS-based locking scheme. Note that if our CAS fails, it means
5284   // that another thread raced us for the privilege of revoking the
5285   // bias of this particular object, so it's okay to continue in the
5286   // normal locking code.
5287   //
5288   // FIXME: due to a lack of registers we currently blow away the age
5289   // bits in this situation. Should attempt to preserve them.
5290   load_prototype_header(tmp_reg, obj_reg);
5291   if (os::is_MP()) {
5292     lock();
5293   }
5294   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5295   // Fall through to the normal CAS-based lock, because no matter what
5296   // the result of the above CAS, some thread must have succeeded in
5297   // removing the bias bit from the object's header.
5298   if (counters != NULL) {
5299     cond_inc32(Assembler::zero,
5300                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
5301   }
5302 
5303   bind(cas_label);
5304 
5305   return null_check_offset;
5306 }
5307 
5308 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
5309   Label L, E;
5310 
5311 #ifdef _WIN64
5312   // Windows always allocates space for it's register args
5313   assert(num_args <= 4, "only register arguments supported");
5314   subq(rsp,  frame::arg_reg_save_area_bytes);
5315 #endif
5316 
5317   // Align stack if necessary
5318   testl(rsp, 15);
5319   jcc(Assembler::zero, L);
5320 
5321   subq(rsp, 8);
5322   {
5323     call(RuntimeAddress(entry_point));
5324   }
5325   addq(rsp, 8);
5326   jmp(E);
5327 
5328   bind(L);
5329   {
5330     call(RuntimeAddress(entry_point));
5331   }
5332 
5333   bind(E);
5334 
5335 #ifdef _WIN64
5336   // restore stack pointer
5337   addq(rsp, frame::arg_reg_save_area_bytes);
5338 #endif
5339 
5340 }
5341 
5342 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
5343   assert(!src2.is_lval(), "should use cmpptr");
5344 
5345   if (reachable(src2)) {
5346     cmpq(src1, as_Address(src2));
5347   } else {
5348     lea(rscratch1, src2);
5349     Assembler::cmpq(src1, Address(rscratch1, 0));
5350   }
5351 }
5352 
5353 int MacroAssembler::corrected_idivq(Register reg) {
5354   // Full implementation of Java ldiv and lrem; checks for special
5355   // case as described in JVM spec., p.243 & p.271.  The function
5356   // returns the (pc) offset of the idivl instruction - may be needed
5357   // for implicit exceptions.
5358   //
5359   //         normal case                           special case
5360   //
5361   // input : rax: dividend                         min_long
5362   //         reg: divisor   (may not be eax/edx)   -1
5363   //
5364   // output: rax: quotient  (= rax idiv reg)       min_long
5365   //         rdx: remainder (= rax irem reg)       0
5366   assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
5367   static const int64_t min_long = 0x8000000000000000;
5368   Label normal_case, special_case;
5369 
5370   // check for special case
5371   cmp64(rax, ExternalAddress((address) &min_long));
5372   jcc(Assembler::notEqual, normal_case);
5373   xorl(rdx, rdx); // prepare rdx for possible special case (where
5374                   // remainder = 0)
5375   cmpq(reg, -1);
5376   jcc(Assembler::equal, special_case);
5377 
5378   // handle normal case
5379   bind(normal_case);
5380   cdqq();
5381   int idivq_offset = offset();
5382   idivq(reg);
5383 
5384   // normal and special case exit
5385   bind(special_case);
5386 
5387   return idivq_offset;
5388 }
5389 
5390 void MacroAssembler::decrementq(Register reg, int value) {
5391   if (value == min_jint) { subq(reg, value); return; }
5392   if (value <  0) { incrementq(reg, -value); return; }
5393   if (value == 0) {                        ; return; }
5394   if (value == 1 && UseIncDec) { decq(reg) ; return; }
5395   /* else */      { subq(reg, value)       ; return; }
5396 }
5397 
5398 void MacroAssembler::decrementq(Address dst, int value) {
5399   if (value == min_jint) { subq(dst, value); return; }
5400   if (value <  0) { incrementq(dst, -value); return; }
5401   if (value == 0) {                        ; return; }
5402   if (value == 1 && UseIncDec) { decq(dst) ; return; }
5403   /* else */      { subq(dst, value)       ; return; }
5404 }
5405 
5406 void MacroAssembler::fat_nop() {
5407   // A 5 byte nop that is safe for patching (see patch_verified_entry)
5408   // Recommened sequence from 'Software Optimization Guide for the AMD
5409   // Hammer Processor'
5410   emit_byte(0x66);
5411   emit_byte(0x66);
5412   emit_byte(0x90);
5413   emit_byte(0x66);
5414   emit_byte(0x90);
5415 }
5416 
5417 void MacroAssembler::incrementq(Register reg, int value) {
5418   if (value == min_jint) { addq(reg, value); return; }
5419   if (value <  0) { decrementq(reg, -value); return; }
5420   if (value == 0) {                        ; return; }
5421   if (value == 1 && UseIncDec) { incq(reg) ; return; }
5422   /* else */      { addq(reg, value)       ; return; }
5423 }
5424 
5425 void MacroAssembler::incrementq(Address dst, int value) {
5426   if (value == min_jint) { addq(dst, value); return; }
5427   if (value <  0) { decrementq(dst, -value); return; }
5428   if (value == 0) {                        ; return; }
5429   if (value == 1 && UseIncDec) { incq(dst) ; return; }
5430   /* else */      { addq(dst, value)       ; return; }
5431 }
5432 
5433 // 32bit can do a case table jump in one instruction but we no longer allow the base
5434 // to be installed in the Address class
5435 void MacroAssembler::jump(ArrayAddress entry) {
5436   lea(rscratch1, entry.base());
5437   Address dispatch = entry.index();
5438   assert(dispatch._base == noreg, "must be");
5439   dispatch._base = rscratch1;
5440   jmp(dispatch);
5441 }
5442 
5443 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
5444   ShouldNotReachHere(); // 64bit doesn't use two regs
5445   cmpq(x_lo, y_lo);
5446 }
5447 
5448 void MacroAssembler::lea(Register dst, AddressLiteral src) {
5449     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
5450 }
5451 
5452 void MacroAssembler::lea(Address dst, AddressLiteral adr) {
5453   mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
5454   movptr(dst, rscratch1);
5455 }
5456 
5457 void MacroAssembler::leave() {
5458   // %%% is this really better? Why not on 32bit too?
5459   emit_byte(0xC9); // LEAVE
5460 }
5461 
5462 void MacroAssembler::lneg(Register hi, Register lo) {
5463   ShouldNotReachHere(); // 64bit doesn't use two regs
5464   negq(lo);
5465 }
5466 
5467 void MacroAssembler::movoop(Register dst, jobject obj) {
5468   mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
5469 }
5470 
5471 void MacroAssembler::movoop(Address dst, jobject obj) {
5472   mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
5473   movq(dst, rscratch1);
5474 }
5475 
5476 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
5477   if (src.is_lval()) {
5478     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
5479   } else {
5480     if (reachable(src)) {
5481       movq(dst, as_Address(src));
5482     } else {
5483       lea(rscratch1, src);
5484       movq(dst, Address(rscratch1,0));
5485     }
5486   }
5487 }
5488 
5489 void MacroAssembler::movptr(ArrayAddress dst, Register src) {
5490   movq(as_Address(dst), src);
5491 }
5492 
5493 void MacroAssembler::movptr(Register dst, ArrayAddress src) {
5494   movq(dst, as_Address(src));
5495 }
5496 
5497 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
5498 void MacroAssembler::movptr(Address dst, intptr_t src) {
5499   mov64(rscratch1, src);
5500   movq(dst, rscratch1);
5501 }
5502 
5503 // These are mostly for initializing NULL
5504 void MacroAssembler::movptr(Address dst, int32_t src) {
5505   movslq(dst, src);
5506 }
5507 
5508 void MacroAssembler::movptr(Register dst, int32_t src) {
5509   mov64(dst, (intptr_t)src);
5510 }
5511 
5512 void MacroAssembler::pushoop(jobject obj) {
5513   movoop(rscratch1, obj);
5514   push(rscratch1);
5515 }
5516 
5517 void MacroAssembler::pushptr(AddressLiteral src) {
5518   lea(rscratch1, src);
5519   if (src.is_lval()) {
5520     push(rscratch1);
5521   } else {
5522     pushq(Address(rscratch1, 0));
5523   }
5524 }
5525 
5526 void MacroAssembler::reset_last_Java_frame(bool clear_fp,
5527                                            bool clear_pc) {
5528   // we must set sp to zero to clear frame
5529   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
5530   // must clear fp, so that compiled frames are not confused; it is
5531   // possible that we need it only for debugging
5532   if (clear_fp) {
5533     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
5534   }
5535 
5536   if (clear_pc) {
5537     movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
5538   }
5539 }
5540 
5541 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
5542                                          Register last_java_fp,
5543                                          address  last_java_pc) {
5544   // determine last_java_sp register
5545   if (!last_java_sp->is_valid()) {
5546     last_java_sp = rsp;
5547   }
5548 
5549   // last_java_fp is optional
5550   if (last_java_fp->is_valid()) {
5551     movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
5552            last_java_fp);
5553   }
5554 
5555   // last_java_pc is optional
5556   if (last_java_pc != NULL) {
5557     Address java_pc(r15_thread,
5558                     JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
5559     lea(rscratch1, InternalAddress(last_java_pc));
5560     movptr(java_pc, rscratch1);
5561   }
5562 
5563   movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
5564 }
5565 
5566 static void pass_arg0(MacroAssembler* masm, Register arg) {
5567   if (c_rarg0 != arg ) {
5568     masm->mov(c_rarg0, arg);
5569   }
5570 }
5571 
5572 static void pass_arg1(MacroAssembler* masm, Register arg) {
5573   if (c_rarg1 != arg ) {
5574     masm->mov(c_rarg1, arg);
5575   }
5576 }
5577 
5578 static void pass_arg2(MacroAssembler* masm, Register arg) {
5579   if (c_rarg2 != arg ) {
5580     masm->mov(c_rarg2, arg);
5581   }
5582 }
5583 
5584 static void pass_arg3(MacroAssembler* masm, Register arg) {
5585   if (c_rarg3 != arg ) {
5586     masm->mov(c_rarg3, arg);
5587   }
5588 }
5589 
5590 void MacroAssembler::stop(const char* msg) {
5591   address rip = pc();
5592   pusha(); // get regs on stack
5593   lea(c_rarg0, ExternalAddress((address) msg));
5594   lea(c_rarg1, InternalAddress(rip));
5595   movq(c_rarg2, rsp); // pass pointer to regs array
5596   andq(rsp, -16); // align stack as required by ABI
5597   call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
5598   hlt();
5599 }
5600 
5601 void MacroAssembler::warn(const char* msg) {
5602   push(rsp);
5603   andq(rsp, -16);     // align stack as required by push_CPU_state and call
5604 
5605   push_CPU_state();   // keeps alignment at 16 bytes
5606   lea(c_rarg0, ExternalAddress((address) msg));
5607   call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
5608   pop_CPU_state();
5609   pop(rsp);
5610 }
5611 
5612 #ifndef PRODUCT
5613 extern "C" void findpc(intptr_t x);
5614 #endif
5615 
5616 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
5617   // In order to get locks to work, we need to fake a in_VM state
5618   if (ShowMessageBoxOnError ) {
5619     JavaThread* thread = JavaThread::current();
5620     JavaThreadState saved_state = thread->thread_state();
5621     thread->set_thread_state(_thread_in_vm);
5622 #ifndef PRODUCT
5623     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
5624       ttyLocker ttyl;
5625       BytecodeCounter::print();
5626     }
5627 #endif
5628     // To see where a verify_oop failed, get $ebx+40/X for this frame.
5629     // XXX correct this offset for amd64
5630     // This is the value of eip which points to where verify_oop will return.
5631     if (os::message_box(msg, "Execution stopped, print registers?")) {
5632       ttyLocker ttyl;
5633       tty->print_cr("rip = 0x%016lx", pc);
5634 #ifndef PRODUCT
5635       tty->cr();
5636       findpc(pc);
5637       tty->cr();
5638 #endif
5639       tty->print_cr("rax = 0x%016lx", regs[15]);
5640       tty->print_cr("rbx = 0x%016lx", regs[12]);
5641       tty->print_cr("rcx = 0x%016lx", regs[14]);
5642       tty->print_cr("rdx = 0x%016lx", regs[13]);
5643       tty->print_cr("rdi = 0x%016lx", regs[8]);
5644       tty->print_cr("rsi = 0x%016lx", regs[9]);
5645       tty->print_cr("rbp = 0x%016lx", regs[10]);
5646       tty->print_cr("rsp = 0x%016lx", regs[11]);
5647       tty->print_cr("r8  = 0x%016lx", regs[7]);
5648       tty->print_cr("r9  = 0x%016lx", regs[6]);
5649       tty->print_cr("r10 = 0x%016lx", regs[5]);
5650       tty->print_cr("r11 = 0x%016lx", regs[4]);
5651       tty->print_cr("r12 = 0x%016lx", regs[3]);
5652       tty->print_cr("r13 = 0x%016lx", regs[2]);
5653       tty->print_cr("r14 = 0x%016lx", regs[1]);
5654       tty->print_cr("r15 = 0x%016lx", regs[0]);
5655       BREAKPOINT;
5656     }
5657     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
5658   } else {
5659     ttyLocker ttyl;
5660     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
5661                     msg);
5662     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
5663   }
5664 }
5665 
5666 #endif // _LP64
5667 
5668 // Now versions that are common to 32/64 bit
5669 
5670 void MacroAssembler::addptr(Register dst, int32_t imm32) {
5671   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
5672 }
5673 
5674 void MacroAssembler::addptr(Register dst, Register src) {
5675   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
5676 }
5677 
5678 void MacroAssembler::addptr(Address dst, Register src) {
5679   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
5680 }
5681 
5682 void MacroAssembler::align(int modulus) {
5683   if (offset() % modulus != 0) {
5684     nop(modulus - (offset() % modulus));
5685   }
5686 }
5687 
5688 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
5689   if (reachable(src)) {
5690     andpd(dst, as_Address(src));
5691   } else {
5692     lea(rscratch1, src);
5693     andpd(dst, Address(rscratch1, 0));
5694   }
5695 }
5696 
5697 void MacroAssembler::andptr(Register dst, int32_t imm32) {
5698   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
5699 }
5700 
5701 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
5702   pushf();
5703   if (os::is_MP())
5704     lock();
5705   incrementl(counter_addr);
5706   popf();
5707 }
5708 
5709 // Writes to stack successive pages until offset reached to check for
5710 // stack overflow + shadow pages.  This clobbers tmp.
5711 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
5712   movptr(tmp, rsp);
5713   // Bang stack for total size given plus shadow page size.
5714   // Bang one page at a time because large size can bang beyond yellow and
5715   // red zones.
5716   Label loop;
5717   bind(loop);
5718   movl(Address(tmp, (-os::vm_page_size())), size );
5719   subptr(tmp, os::vm_page_size());
5720   subl(size, os::vm_page_size());
5721   jcc(Assembler::greater, loop);
5722 
5723   // Bang down shadow pages too.
5724   // The -1 because we already subtracted 1 page.
5725   for (int i = 0; i< StackShadowPages-1; i++) {
5726     // this could be any sized move but this is can be a debugging crumb
5727     // so the bigger the better.
5728     movptr(Address(tmp, (-i*os::vm_page_size())), size );
5729   }
5730 }
5731 
5732 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
5733   assert(UseBiasedLocking, "why call this otherwise?");
5734 
5735   // Check for biased locking unlock case, which is a no-op
5736   // Note: we do not have to check the thread ID for two reasons.
5737   // First, the interpreter checks for IllegalMonitorStateException at
5738   // a higher level. Second, if the bias was revoked while we held the
5739   // lock, the object could not be rebiased toward another thread, so
5740   // the bias bit would be clear.
5741   movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
5742   andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
5743   cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
5744   jcc(Assembler::equal, done);
5745 }
5746 
5747 void MacroAssembler::c2bool(Register x) {
5748   // implements x == 0 ? 0 : 1
5749   // note: must only look at least-significant byte of x
5750   //       since C-style booleans are stored in one byte
5751   //       only! (was bug)
5752   andl(x, 0xFF);
5753   setb(Assembler::notZero, x);
5754 }
5755 
5756 // Wouldn't need if AddressLiteral version had new name
5757 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
5758   Assembler::call(L, rtype);
5759 }
5760 
5761 void MacroAssembler::call(Register entry) {
5762   Assembler::call(entry);
5763 }
5764 
5765 void MacroAssembler::call(AddressLiteral entry) {
5766   if (reachable(entry)) {
5767     Assembler::call_literal(entry.target(), entry.rspec());
5768   } else {
5769     lea(rscratch1, entry);
5770     Assembler::call(rscratch1);
5771   }
5772 }
5773 
5774 // Implementation of call_VM versions
5775 
5776 void MacroAssembler::call_VM(Register oop_result,
5777                              address entry_point,
5778                              bool check_exceptions) {
5779   Label C, E;
5780   call(C, relocInfo::none);
5781   jmp(E);
5782 
5783   bind(C);
5784   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
5785   ret(0);
5786 
5787   bind(E);
5788 }
5789 
5790 void MacroAssembler::call_VM(Register oop_result,
5791                              address entry_point,
5792                              Register arg_1,
5793                              bool check_exceptions) {
5794   Label C, E;
5795   call(C, relocInfo::none);
5796   jmp(E);
5797 
5798   bind(C);
5799   pass_arg1(this, arg_1);
5800   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
5801   ret(0);
5802 
5803   bind(E);
5804 }
5805 
5806 void MacroAssembler::call_VM(Register oop_result,
5807                              address entry_point,
5808                              Register arg_1,
5809                              Register arg_2,
5810                              bool check_exceptions) {
5811   Label C, E;
5812   call(C, relocInfo::none);
5813   jmp(E);
5814 
5815   bind(C);
5816 
5817   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
5818 
5819   pass_arg2(this, arg_2);
5820   pass_arg1(this, arg_1);
5821   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
5822   ret(0);
5823 
5824   bind(E);
5825 }
5826 
5827 void MacroAssembler::call_VM(Register oop_result,
5828                              address entry_point,
5829                              Register arg_1,
5830                              Register arg_2,
5831                              Register arg_3,
5832                              bool check_exceptions) {
5833   Label C, E;
5834   call(C, relocInfo::none);
5835   jmp(E);
5836 
5837   bind(C);
5838 
5839   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
5840   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
5841   pass_arg3(this, arg_3);
5842 
5843   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
5844   pass_arg2(this, arg_2);
5845 
5846   pass_arg1(this, arg_1);
5847   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
5848   ret(0);
5849 
5850   bind(E);
5851 }
5852 
5853 void MacroAssembler::call_VM(Register oop_result,
5854                              Register last_java_sp,
5855                              address entry_point,
5856                              int number_of_arguments,
5857                              bool check_exceptions) {
5858   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
5859   call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
5860 }
5861 
5862 void MacroAssembler::call_VM(Register oop_result,
5863                              Register last_java_sp,
5864                              address entry_point,
5865                              Register arg_1,
5866                              bool check_exceptions) {
5867   pass_arg1(this, arg_1);
5868   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
5869 }
5870 
5871 void MacroAssembler::call_VM(Register oop_result,
5872                              Register last_java_sp,
5873                              address entry_point,
5874                              Register arg_1,
5875                              Register arg_2,
5876                              bool check_exceptions) {
5877 
5878   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
5879   pass_arg2(this, arg_2);
5880   pass_arg1(this, arg_1);
5881   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
5882 }
5883 
5884 void MacroAssembler::call_VM(Register oop_result,
5885                              Register last_java_sp,
5886                              address entry_point,
5887                              Register arg_1,
5888                              Register arg_2,
5889                              Register arg_3,
5890                              bool check_exceptions) {
5891   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
5892   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
5893   pass_arg3(this, arg_3);
5894   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
5895   pass_arg2(this, arg_2);
5896   pass_arg1(this, arg_1);
5897   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
5898 }
5899 
5900 void MacroAssembler::super_call_VM(Register oop_result,
5901                                    Register last_java_sp,
5902                                    address entry_point,
5903                                    int number_of_arguments,
5904                                    bool check_exceptions) {
5905   Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
5906   MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
5907 }
5908 
5909 void MacroAssembler::super_call_VM(Register oop_result,
5910                                    Register last_java_sp,
5911                                    address entry_point,
5912                                    Register arg_1,
5913                                    bool check_exceptions) {
5914   pass_arg1(this, arg_1);
5915   super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
5916 }
5917 
5918 void MacroAssembler::super_call_VM(Register oop_result,
5919                                    Register last_java_sp,
5920                                    address entry_point,
5921                                    Register arg_1,
5922                                    Register arg_2,
5923                                    bool check_exceptions) {
5924 
5925   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
5926   pass_arg2(this, arg_2);
5927   pass_arg1(this, arg_1);
5928   super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
5929 }
5930 
5931 void MacroAssembler::super_call_VM(Register oop_result,
5932                                    Register last_java_sp,
5933                                    address entry_point,
5934                                    Register arg_1,
5935                                    Register arg_2,
5936                                    Register arg_3,
5937                                    bool check_exceptions) {
5938   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
5939   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
5940   pass_arg3(this, arg_3);
5941   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
5942   pass_arg2(this, arg_2);
5943   pass_arg1(this, arg_1);
5944   super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
5945 }
5946 
// Emits the common sequence for calling a VM routine from generated code:
// passes the thread as first C argument, records the last Java frame, calls
// entry_point through call_VM_leaf_base, resets the last Java frame and then
// optionally checks for a pending exception and fetches an oop result.
//
//   oop_result          - if valid, receives the value stored at the thread's
//                         vm_result offset, which is then reset to NULL_WORD
//   java_thread         - register holding the thread; if not valid a default
//                         is picked (r15_thread on 64-bit; rdi, loaded via
//                         get_thread, on 32-bit)
//   last_java_sp        - register recorded as last Java sp; rsp if not valid
//   entry_point         - address of the routine to call
//   number_of_arguments - argument count, must be >= 0; on 32-bit it is
//                         incremented to account for the pushed thread
//   check_exceptions    - if true, control transfers to
//                         StubRoutines::forward_exception_entry() when the
//                         thread's pending exception field is not NULL_WORD
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // 64-bit debug builds with compressed oops: emit a heap base verification.
  LP64_ONLY(if (UseCompressedOops) verify_heapbase("call_VM_base");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  // 32-bit: the thread is pushed and counted as an extra argument.
  // 64-bit: the thread is moved into c_rarg0 instead.

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  // (explicitly qualified call to MacroAssembler's call_VM_leaf_base)
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  // On 64-bit the condition below is always true.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    // Debug builds: reload the thread into rax (saved/restored around the
    // check) and stop if it differs from java_thread.
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      stop("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true, false);

#ifndef CC_INTERP
   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);
#endif /* CC_INTERP */

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
    movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
    verify_oop(oop_result, "broken oop in call_VM_base");
  }
}
6045 
6046 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
6047 
6048   // Calculate the value for last_Java_sp
6049   // somewhat subtle. call_VM does an intermediate call
6050   // which places a return address on the stack just under the
6051   // stack pointer as the user finsihed with it. This allows
6052   // use to retrieve last_Java_pc from last_Java_sp[-1].
6053   // On 32bit we then have to push additional args on the stack to accomplish
6054   // the actual requested call. On 64bit call_VM only can use register args
6055   // so the only extra space is the return address that call_VM created.
6056   // This hopefully explains the calculations here.
6057 
6058 #ifdef _LP64
6059   // We've pushed one address, correct last_Java_sp
6060   lea(rax, Address(rsp, wordSize));
6061 #else
6062   lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
6063 #endif // LP64
6064 
6065   call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
6066 
6067 }
6068 
6069 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
6070   call_VM_leaf_base(entry_point, number_of_arguments);
6071 }
6072 
6073 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
6074   pass_arg0(this, arg_0);
6075   call_VM_leaf(entry_point, 1);
6076 }
6077 
6078 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
6079 
6080   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
6081   pass_arg1(this, arg_1);
6082   pass_arg0(this, arg_0);
6083   call_VM_leaf(entry_point, 2);
6084 }
6085 
6086 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
6087   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
6088   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6089   pass_arg2(this, arg_2);
6090   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
6091   pass_arg1(this, arg_1);
6092   pass_arg0(this, arg_0);
6093   call_VM_leaf(entry_point, 3);
6094 }
6095 
6096 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
6097   pass_arg0(this, arg_0);
6098   MacroAssembler::call_VM_leaf_base(entry_point, 1);
6099 }
6100 
6101 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
6102 
6103   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
6104   pass_arg1(this, arg_1);
6105   pass_arg0(this, arg_0);
6106   MacroAssembler::call_VM_leaf_base(entry_point, 2);
6107 }
6108 
6109 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
6110   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
6111   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6112   pass_arg2(this, arg_2);
6113   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
6114   pass_arg1(this, arg_1);
6115   pass_arg0(this, arg_0);
6116   MacroAssembler::call_VM_leaf_base(entry_point, 3);
6117 }
6118 
6119 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
6120   LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
6121   LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
6122   LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
6123   pass_arg3(this, arg_3);
6124   LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
6125   LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
6126   pass_arg2(this, arg_2);
6127   LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
6128   pass_arg1(this, arg_1);
6129   pass_arg0(this, arg_0);
6130   MacroAssembler::call_VM_leaf_base(entry_point, 4);
6131 }
6132 
// Hook invoked from call_VM_base (when CC_INTERP is not defined) after the
// last Java frame has been reset. This implementation emits nothing.
// NOTE(review): presumably a subclass provides a non-empty version — confirm.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
6135 
// Hook invoked from call_VM_base (when CC_INTERP is not defined) after the
// last Java frame has been reset. This implementation emits nothing.
// NOTE(review): presumably a subclass provides a non-empty version — confirm.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
6138 
6139 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
6140   if (reachable(src1)) {
6141     cmpl(as_Address(src1), imm);
6142   } else {
6143     lea(rscratch1, src1);
6144     cmpl(Address(rscratch1, 0), imm);
6145   }
6146 }
6147 
6148 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
6149   assert(!src2.is_lval(), "use cmpptr");
6150   if (reachable(src2)) {
6151     cmpl(src1, as_Address(src2));
6152   } else {
6153     lea(rscratch1, src2);
6154     cmpl(src1, Address(rscratch1, 0));
6155   }
6156 }
6157 
6158 void MacroAssembler::cmp32(Register src1, int32_t imm) {
6159   Assembler::cmpl(src1, imm);
6160 }
6161 
6162 void MacroAssembler::cmp32(Register src1, Address src2) {
6163   Assembler::cmpl(src1, src2);
6164 }
6165 
6166 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
6167   ucomisd(opr1, opr2);
6168 
6169   Label L;
6170   if (unordered_is_less) {
6171     movl(dst, -1);
6172     jcc(Assembler::parity, L);
6173     jcc(Assembler::below , L);
6174     movl(dst, 0);
6175     jcc(Assembler::equal , L);
6176     increment(dst);
6177   } else { // unordered is greater
6178     movl(dst, 1);
6179     jcc(Assembler::parity, L);
6180     jcc(Assembler::above , L);
6181     movl(dst, 0);
6182     jcc(Assembler::equal , L);
6183     decrementl(dst);
6184   }
6185   bind(L);
6186 }
6187 
6188 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
6189   ucomiss(opr1, opr2);
6190 
6191   Label L;
6192   if (unordered_is_less) {
6193     movl(dst, -1);
6194     jcc(Assembler::parity, L);
6195     jcc(Assembler::below , L);
6196     movl(dst, 0);
6197     jcc(Assembler::equal , L);
6198     increment(dst);
6199   } else { // unordered is greater
6200     movl(dst, 1);
6201     jcc(Assembler::parity, L);
6202     jcc(Assembler::above , L);
6203     movl(dst, 0);
6204     jcc(Assembler::equal , L);
6205     decrementl(dst);
6206   }
6207   bind(L);
6208 }
6209 
6210 
6211 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
6212   if (reachable(src1)) {
6213     cmpb(as_Address(src1), imm);
6214   } else {
6215     lea(rscratch1, src1);
6216     cmpb(Address(rscratch1, 0), imm);
6217   }
6218 }
6219 
6220 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
6221 #ifdef _LP64
6222   if (src2.is_lval()) {
6223     movptr(rscratch1, src2);
6224     Assembler::cmpq(src1, rscratch1);
6225   } else if (reachable(src2)) {
6226     cmpq(src1, as_Address(src2));
6227   } else {
6228     lea(rscratch1, src2);
6229     Assembler::cmpq(src1, Address(rscratch1, 0));
6230   }
6231 #else
6232   if (src2.is_lval()) {
6233     cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
6234   } else {
6235     cmpl(src1, as_Address(src2));
6236   }
6237 #endif // _LP64
6238 }
6239 
6240 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
6241   assert(src2.is_lval(), "not a mem-mem compare");
6242 #ifdef _LP64
6243   // moves src2's literal address
6244   movptr(rscratch1, src2);
6245   Assembler::cmpq(src1, rscratch1);
6246 #else
6247   cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
6248 #endif // _LP64
6249 }
6250 
6251 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
6252   if (reachable(adr)) {
6253     if (os::is_MP())
6254       lock();
6255     cmpxchgptr(reg, as_Address(adr));
6256   } else {
6257     lea(rscratch1, adr);
6258     if (os::is_MP())
6259       lock();
6260     cmpxchgptr(reg, Address(rscratch1, 0));
6261   }
6262 }
6263 
6264 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
6265   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
6266 }
6267 
6268 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
6269   if (reachable(src)) {
6270     comisd(dst, as_Address(src));
6271   } else {
6272     lea(rscratch1, src);
6273     comisd(dst, Address(rscratch1, 0));
6274   }
6275 }
6276 
6277 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
6278   if (reachable(src)) {
6279     comiss(dst, as_Address(src));
6280   } else {
6281     lea(rscratch1, src);
6282     comiss(dst, Address(rscratch1, 0));
6283   }
6284 }
6285 
6286 
6287 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
6288   Condition negated_cond = negate_condition(cond);
6289   Label L;
6290   jcc(negated_cond, L);
6291   atomic_incl(counter_addr);
6292   bind(L);
6293 }
6294 
6295 int MacroAssembler::corrected_idivl(Register reg) {
6296   // Full implementation of Java idiv and irem; checks for
6297   // special case as described in JVM spec., p.243 & p.271.
6298   // The function returns the (pc) offset of the idivl
6299   // instruction - may be needed for implicit exceptions.
6300   //
6301   //         normal case                           special case
6302   //
6303   // input : rax,: dividend                         min_int
6304   //         reg: divisor   (may not be rax,/rdx)   -1
6305   //
6306   // output: rax,: quotient  (= rax, idiv reg)       min_int
6307   //         rdx: remainder (= rax, irem reg)       0
6308   assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
6309   const int min_int = 0x80000000;
6310   Label normal_case, special_case;
6311 
6312   // check for special case
6313   cmpl(rax, min_int);
6314   jcc(Assembler::notEqual, normal_case);
6315   xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
6316   cmpl(reg, -1);
6317   jcc(Assembler::equal, special_case);
6318 
6319   // handle normal case
6320   bind(normal_case);
6321   cdql();
6322   int idivl_offset = offset();
6323   idivl(reg);
6324 
6325   // normal and special case exit
6326   bind(special_case);
6327 
6328   return idivl_offset;
6329 }
6330 
6331 
6332 
6333 void MacroAssembler::decrementl(Register reg, int value) {
6334   if (value == min_jint) {subl(reg, value) ; return; }
6335   if (value <  0) { incrementl(reg, -value); return; }
6336   if (value == 0) {                        ; return; }
6337   if (value == 1 && UseIncDec) { decl(reg) ; return; }
6338   /* else */      { subl(reg, value)       ; return; }
6339 }
6340 
6341 void MacroAssembler::decrementl(Address dst, int value) {
6342   if (value == min_jint) {subl(dst, value) ; return; }
6343   if (value <  0) { incrementl(dst, -value); return; }
6344   if (value == 0) {                        ; return; }
6345   if (value == 1 && UseIncDec) { decl(dst) ; return; }
6346   /* else */      { subl(dst, value)       ; return; }
6347 }
6348 
6349 void MacroAssembler::division_with_shift (Register reg, int shift_value) {
6350   assert (shift_value > 0, "illegal shift value");
6351   Label _is_positive;
6352   testl (reg, reg);
6353   jcc (Assembler::positive, _is_positive);
6354   int offset = (1 << shift_value) - 1 ;
6355 
6356   if (offset == 1) {
6357     incrementl(reg);
6358   } else {
6359     addl(reg, offset);
6360   }
6361 
6362   bind (_is_positive);
6363   sarl(reg, shift_value);
6364 }
6365 
6366 // !defined(COMPILER2) is because of stupid core builds
6367 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
6368 void MacroAssembler::empty_FPU_stack() {
6369   if (VM_Version::supports_mmx()) {
6370     emms();
6371   } else {
6372     for (int i = 8; i-- > 0; ) ffree(i);
6373   }
6374 }
6375 #endif // !LP64 || C1 || !C2
6376 
6377 
6378 // Defines obj, preserves var_size_in_bytes
6379 void MacroAssembler::eden_allocate(Register obj,
6380                                    Register var_size_in_bytes,
6381                                    int con_size_in_bytes,
6382                                    Register t1,
6383                                    Label& slow_case) {
6384   assert(obj == rax, "obj must be in rax, for cmpxchg");
6385   assert_different_registers(obj, var_size_in_bytes, t1);
6386   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
6387     jmp(slow_case);
6388   } else {
6389     Register end = t1;
6390     Label retry;
6391     bind(retry);
6392     ExternalAddress heap_top((address) Universe::heap()->top_addr());
6393     movptr(obj, heap_top);
6394     if (var_size_in_bytes == noreg) {
6395       lea(end, Address(obj, con_size_in_bytes));
6396     } else {
6397       lea(end, Address(obj, var_size_in_bytes, Address::times_1));
6398     }
6399     // if end < obj then we wrapped around => object too long => slow case
6400     cmpptr(end, obj);
6401     jcc(Assembler::below, slow_case);
6402     cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
6403     jcc(Assembler::above, slow_case);
6404     // Compare obj with the top addr, and if still equal, store the new top addr in
6405     // end at the address of the top addr pointer. Sets ZF if was equal, and clears
6406     // it otherwise. Use lock prefix for atomicity on MPs.
6407     locked_cmpxchgptr(end, heap_top);
6408     jcc(Assembler::notEqual, retry);
6409   }
6410 }
6411 
6412 void MacroAssembler::enter() {
6413   push(rbp);
6414   mov(rbp, rsp);
6415 }
6416 
6417 void MacroAssembler::fcmp(Register tmp) {
6418   fcmp(tmp, 1, true, true);
6419 }
6420 
// Emits an x87 floating-point compare that leaves its outcome in the eflags
// condition codes (see the table at the end of the function).
//
//   tmp       - must be noreg when VM_Version::supports_cmov() is true and a
//               real register otherwise; it is handed to save_rax/restore_rax
//               around the status-word transfer through ax
//   index     - FPU stack index passed to the compare instruction
//   pop_left  - pop as part of the compare
//   pop_right - additionally pop the other operand; only legal together with
//               pop_left
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    // fucomi/fucomip path: no temp register is required.
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    // fcom path: the result has to be moved from the FPU status word into
    // eflags by hand.
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        // note: index is not passed to fcompp
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
6456 
6457 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
6458   fcmp2int(dst, unordered_is_less, 1, true, true);
6459 }
6460 
6461 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
6462   fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
6463   Label L;
6464   if (unordered_is_less) {
6465     movl(dst, -1);
6466     jcc(Assembler::parity, L);
6467     jcc(Assembler::below , L);
6468     movl(dst, 0);
6469     jcc(Assembler::equal , L);
6470     increment(dst);
6471   } else { // unordered is greater
6472     movl(dst, 1);
6473     jcc(Assembler::parity, L);
6474     jcc(Assembler::above , L);
6475     movl(dst, 0);
6476     jcc(Assembler::equal , L);
6477     decrementl(dst);
6478   }
6479   bind(L);
6480 }
6481 
6482 void MacroAssembler::fld_d(AddressLiteral src) {
6483   fld_d(as_Address(src));
6484 }
6485 
6486 void MacroAssembler::fld_s(AddressLiteral src) {
6487   fld_s(as_Address(src));
6488 }
6489 
6490 void MacroAssembler::fld_x(AddressLiteral src) {
6491   Assembler::fld_x(as_Address(src));
6492 }
6493 
6494 void MacroAssembler::fldcw(AddressLiteral src) {
6495   Assembler::fldcw(as_Address(src));
6496 }
6497 
6498 void MacroAssembler::fpop() {
6499   ffree();
6500   fincstp();
6501 }
6502 
6503 void MacroAssembler::fremr(Register tmp) {
6504   save_rax(tmp);
6505   { Label L;
6506     bind(L);
6507     fprem();
6508     fwait(); fnstsw_ax();
6509 #ifdef _LP64
6510     testl(rax, 0x400);
6511     jcc(Assembler::notEqual, L);
6512 #else
6513     sahf();
6514     jcc(Assembler::parity, L);
6515 #endif // _LP64
6516   }
6517   restore_rax(tmp);
6518   // Result is in ST0.
6519   // Note: fxch & fpop to get rid of ST1
6520   // (otherwise FPU stack could overflow eventually)
6521   fxch(1);
6522   fpop();
6523 }
6524 
6525 
6526 void MacroAssembler::incrementl(AddressLiteral dst) {
6527   if (reachable(dst)) {
6528     incrementl(as_Address(dst));
6529   } else {
6530     lea(rscratch1, dst);
6531     incrementl(Address(rscratch1, 0));
6532   }
6533 }
6534 
6535 void MacroAssembler::incrementl(ArrayAddress dst) {
6536   incrementl(as_Address(dst));
6537 }
6538 
6539 void MacroAssembler::incrementl(Register reg, int value) {
6540   if (value == min_jint) {addl(reg, value) ; return; }
6541   if (value <  0) { decrementl(reg, -value); return; }
6542   if (value == 0) {                        ; return; }
6543   if (value == 1 && UseIncDec) { incl(reg) ; return; }
6544   /* else */      { addl(reg, value)       ; return; }
6545 }
6546 
6547 void MacroAssembler::incrementl(Address dst, int value) {
6548   if (value == min_jint) {addl(dst, value) ; return; }
6549   if (value <  0) { decrementl(dst, -value); return; }
6550   if (value == 0) {                        ; return; }
6551   if (value == 1 && UseIncDec) { incl(dst) ; return; }
6552   /* else */      { addl(dst, value)       ; return; }
6553 }
6554 
6555 void MacroAssembler::jump(AddressLiteral dst) {
6556   if (reachable(dst)) {
6557     jmp_literal(dst.target(), dst.rspec());
6558   } else {
6559     lea(rscratch1, dst);
6560     jmp(rscratch1);
6561   }
6562 }
6563 
// Conditional jump to an address literal.
//
// When dst is reachable the Jcc instruction is encoded by hand: the 2-byte
// short form (opcode + 8-bit displacement) if dst carries no relocation and
// the displacement fits in 8 bits, otherwise the 6-byte long form
// (0x0F, opcode, 32-bit displacement).
// When dst is not reachable, a short branch on reverse[cc] skips over an
// indirect jump through rscratch1.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    // total instruction lengths in bytes for the two encodings
    const int short_size = 2;
    const int long_size = 6;
    // distance from the current code position to the target; the emitted
    // displacement is this value minus the instruction length
    int offs = (intptr_t)dst.target() - ((intptr_t)_code_pos);
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    // NOTE(review): reverse[] presumably maps a condition to its opposite,
    // as the warning text suggests — confirm against its definition.
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
6592 
6593 void MacroAssembler::ldmxcsr(AddressLiteral src) {
6594   if (reachable(src)) {
6595     Assembler::ldmxcsr(as_Address(src));
6596   } else {
6597     lea(rscratch1, src);
6598     Assembler::ldmxcsr(Address(rscratch1, 0));
6599   }
6600 }
6601 
6602 int MacroAssembler::load_signed_byte(Register dst, Address src) {
6603   int off;
6604   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
6605     off = offset();
6606     movsbl(dst, src); // movsxb
6607   } else {
6608     off = load_unsigned_byte(dst, src);
6609     shll(dst, 24);
6610     sarl(dst, 24);
6611   }
6612   return off;
6613 }
6614 
6615 // Note: load_signed_short used to be called load_signed_word.
6616 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
6617 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
6618 // The term "word" in HotSpot means a 32- or 64-bit machine word.
6619 int MacroAssembler::load_signed_short(Register dst, Address src) {
6620   int off;
6621   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
6622     // This is dubious to me since it seems safe to do a signed 16 => 64 bit
6623     // version but this is what 64bit has always done. This seems to imply
6624     // that users are only using 32bits worth.
6625     off = offset();
6626     movswl(dst, src); // movsxw
6627   } else {
6628     off = load_unsigned_short(dst, src);
6629     shll(dst, 16);
6630     sarl(dst, 16);
6631   }
6632   return off;
6633 }
6634 
6635 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
6636   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
6637   // and "3.9 Partial Register Penalties", p. 22).
6638   int off;
6639   if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
6640     off = offset();
6641     movzbl(dst, src); // movzxb
6642   } else {
6643     xorl(dst, dst);
6644     off = offset();
6645     movb(dst, src);
6646   }
6647   return off;
6648 }
6649 
6650 // Note: load_unsigned_short used to be called load_unsigned_word.
6651 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
6652   // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
6653   // and "3.9 Partial Register Penalties", p. 22).
6654   int off;
6655   if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
6656     off = offset();
6657     movzwl(dst, src); // movzxw
6658   } else {
6659     xorl(dst, dst);
6660     off = offset();
6661     movw(dst, src);
6662   }
6663   return off;
6664 }
6665 
6666 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
6667   switch (size_in_bytes) {
6668 #ifndef _LP64
6669   case  8:
6670     assert(dst2 != noreg, "second dest register required");
6671     movl(dst,  src);
6672     movl(dst2, src.plus_disp(BytesPerInt));
6673     break;
6674 #else
6675   case  8:  movq(dst, src); break;
6676 #endif
6677   case  4:  movl(dst, src); break;
6678   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
6679   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
6680   default:  ShouldNotReachHere();
6681   }
6682 }
6683 
6684 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
6685   switch (size_in_bytes) {
6686 #ifndef _LP64
6687   case  8:
6688     assert(src2 != noreg, "second source register required");
6689     movl(dst,                        src);
6690     movl(dst.plus_disp(BytesPerInt), src2);
6691     break;
6692 #else
6693   case  8:  movq(dst, src); break;
6694 #endif
6695   case  4:  movl(dst, src); break;
6696   case  2:  movw(dst, src); break;
6697   case  1:  movb(dst, src); break;
6698   default:  ShouldNotReachHere();
6699   }
6700 }
6701 
6702 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
6703   if (reachable(dst)) {
6704     movl(as_Address(dst), src);
6705   } else {
6706     lea(rscratch1, dst);
6707     movl(Address(rscratch1, 0), src);
6708   }
6709 }
6710 
6711 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
6712   if (reachable(src)) {
6713     movl(dst, as_Address(src));
6714   } else {
6715     lea(rscratch1, src);
6716     movl(dst, Address(rscratch1, 0));
6717   }
6718 }
6719 
6720 // C++ bool manipulation
6721 
6722 void MacroAssembler::movbool(Register dst, Address src) {
6723   if(sizeof(bool) == 1)
6724     movb(dst, src);
6725   else if(sizeof(bool) == 2)
6726     movw(dst, src);
6727   else if(sizeof(bool) == 4)
6728     movl(dst, src);
6729   else
6730     // unsupported
6731     ShouldNotReachHere();
6732 }
6733 
6734 void MacroAssembler::movbool(Address dst, bool boolconst) {
6735   if(sizeof(bool) == 1)
6736     movb(dst, (int) boolconst);
6737   else if(sizeof(bool) == 2)
6738     movw(dst, (int) boolconst);
6739   else if(sizeof(bool) == 4)
6740     movl(dst, (int) boolconst);
6741   else
6742     // unsupported
6743     ShouldNotReachHere();
6744 }
6745 
6746 void MacroAssembler::movbool(Address dst, Register src) {
6747   if(sizeof(bool) == 1)
6748     movb(dst, src);
6749   else if(sizeof(bool) == 2)
6750     movw(dst, src);
6751   else if(sizeof(bool) == 4)
6752     movl(dst, src);
6753   else
6754     // unsupported
6755     ShouldNotReachHere();
6756 }
6757 
6758 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
6759   movb(as_Address(dst), src);
6760 }
6761 
6762 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
6763   if (reachable(src)) {
6764     if (UseXmmLoadAndClearUpper) {
6765       movsd (dst, as_Address(src));
6766     } else {
6767       movlpd(dst, as_Address(src));
6768     }
6769   } else {
6770     lea(rscratch1, src);
6771     if (UseXmmLoadAndClearUpper) {
6772       movsd (dst, Address(rscratch1, 0));
6773     } else {
6774       movlpd(dst, Address(rscratch1, 0));
6775     }
6776   }
6777 }
6778 
6779 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
6780   if (reachable(src)) {
6781     movss(dst, as_Address(src));
6782   } else {
6783     lea(rscratch1, src);
6784     movss(dst, Address(rscratch1, 0));
6785   }
6786 }
6787 
6788 void MacroAssembler::movptr(Register dst, Register src) {
6789   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
6790 }
6791 
6792 void MacroAssembler::movptr(Register dst, Address src) {
6793   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
6794 }
6795 
6796 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
6797 void MacroAssembler::movptr(Register dst, intptr_t src) {
6798   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
6799 }
6800 
6801 void MacroAssembler::movptr(Address dst, Register src) {
6802   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
6803 }
6804 
6805 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
6806   if (reachable(src)) {
6807     movss(dst, as_Address(src));
6808   } else {
6809     lea(rscratch1, src);
6810     movss(dst, Address(rscratch1, 0));
6811   }
6812 }
6813 
6814 void MacroAssembler::null_check(Register reg, int offset) {
6815   if (needs_explicit_null_check(offset)) {
6816     // provoke OS NULL exception if reg = NULL by
6817     // accessing M[reg] w/o changing any (non-CC) registers
6818     // NOTE: cmpl is plenty here to provoke a segv
6819     cmpptr(rax, Address(reg, 0));
6820     // Note: should probably use testl(rax, Address(reg, 0));
6821     //       may be shorter code (however, this version of
6822     //       testl needs to be implemented first)
6823   } else {
6824     // nothing to do, (later) access of M[reg + offset]
6825     // will provoke OS NULL exception if reg = NULL
6826   }
6827 }
6828 
6829 void MacroAssembler::os_breakpoint() {
6830   // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
6831   // (e.g., MSVC can't call ps() otherwise)
6832   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
6833 }
6834 
6835 void MacroAssembler::pop_CPU_state() {
6836   pop_FPU_state();
6837   pop_IU_state();
6838 }
6839 
6840 void MacroAssembler::pop_FPU_state() {
6841   NOT_LP64(frstor(Address(rsp, 0));)
6842   LP64_ONLY(fxrstor(Address(rsp, 0));)
6843   addptr(rsp, FPUStateSizeInWords * wordSize);
6844 }
6845 
6846 void MacroAssembler::pop_IU_state() {
6847   popa();
6848   LP64_ONLY(addq(rsp, 8));
6849   popf();
6850 }
6851 
6852 // Save Integer and Float state
6853 // Warning: Stack must be 16 byte aligned (64bit)
6854 void MacroAssembler::push_CPU_state() {
6855   push_IU_state();
6856   push_FPU_state();
6857 }
6858 
6859 void MacroAssembler::push_FPU_state() {
6860   subptr(rsp, FPUStateSizeInWords * wordSize);
6861 #ifndef _LP64
6862   fnsave(Address(rsp, 0));
6863   fwait();
6864 #else
6865   fxsave(Address(rsp, 0));
6866 #endif // LP64
6867 }
6868 
6869 void MacroAssembler::push_IU_state() {
6870   // Push flags first because pusha kills them
6871   pushf();
6872   // Make sure rsp stays 16-byte aligned
6873   LP64_ONLY(subq(rsp, 8));
6874   pusha();
6875 }
6876 
6877 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
6878   // determine java_thread register
6879   if (!java_thread->is_valid()) {
6880     java_thread = rdi;
6881     get_thread(java_thread);
6882   }
6883   // we must set sp to zero to clear frame
6884   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
6885   if (clear_fp) {
6886     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
6887   }
6888 
6889   if (clear_pc)
6890     movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
6891 
6892 }
6893 
6894 void MacroAssembler::restore_rax(Register tmp) {
6895   if (tmp == noreg) pop(rax);
6896   else if (tmp != rax) mov(rax, tmp);
6897 }
6898 
6899 void MacroAssembler::round_to(Register reg, int modulus) {
6900   addptr(reg, modulus - 1);
6901   andptr(reg, -modulus);
6902 }
6903 
6904 void MacroAssembler::save_rax(Register tmp) {
6905   if (tmp == noreg) push(rax);
6906   else if (tmp != rax) mov(tmp, rax);
6907 }
6908 
6909 // Write serialization page so VM thread can do a pseudo remote membar.
6910 // We use the current thread pointer to calculate a thread specific
6911 // offset to write to within the page. This minimizes bus traffic
6912 // due to cache line collision.
6913 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
6914   movl(tmp, thread);
6915   shrl(tmp, os::get_serialize_page_shift_count());
6916   andl(tmp, (os::vm_page_size() - sizeof(int)));
6917 
6918   Address index(noreg, tmp, Address::times_1);
6919   ExternalAddress page(os::get_memory_serialize_page());
6920 
6921   // Size of store must match masking code above
6922   movl(as_Address(ArrayAddress(page, index)), tmp);
6923 }
6924 
6925 // Calls to C land
6926 //
6927 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
6928 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
6929 // has to be reset to 0. This is required to allow proper stack traversal.
6930 void MacroAssembler::set_last_Java_frame(Register java_thread,
6931                                          Register last_java_sp,
6932                                          Register last_java_fp,
6933                                          address  last_java_pc) {
6934   // determine java_thread register
6935   if (!java_thread->is_valid()) {
6936     java_thread = rdi;
6937     get_thread(java_thread);
6938   }
6939   // determine last_java_sp register
6940   if (!last_java_sp->is_valid()) {
6941     last_java_sp = rsp;
6942   }
6943 
6944   // last_java_fp is optional
6945 
6946   if (last_java_fp->is_valid()) {
6947     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
6948   }
6949 
6950   // last_java_pc is optional
6951 
6952   if (last_java_pc != NULL) {
6953     lea(Address(java_thread,
6954                  JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
6955         InternalAddress(last_java_pc));
6956 
6957   }
6958   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
6959 }
6960 
6961 void MacroAssembler::shlptr(Register dst, int imm8) {
6962   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
6963 }
6964 
6965 void MacroAssembler::shrptr(Register dst, int imm8) {
6966   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
6967 }
6968 
6969 void MacroAssembler::sign_extend_byte(Register reg) {
6970   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
6971     movsbl(reg, reg); // movsxb
6972   } else {
6973     shll(reg, 24);
6974     sarl(reg, 24);
6975   }
6976 }
6977 
6978 void MacroAssembler::sign_extend_short(Register reg) {
6979   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
6980     movswl(reg, reg); // movsxw
6981   } else {
6982     shll(reg, 16);
6983     sarl(reg, 16);
6984   }
6985 }
6986 
6987 void MacroAssembler::testl(Register dst, AddressLiteral src) {
6988   assert(reachable(src), "Address should be reachable");
6989   testl(dst, as_Address(src));
6990 }
6991 
6992 //////////////////////////////////////////////////////////////////////////////////
6993 #ifndef SERIALGC
6994 
6995 void MacroAssembler::g1_write_barrier_pre(Register obj,
6996                                           Register pre_val,
6997                                           Register thread,
6998                                           Register tmp,
6999                                           bool tosca_live,
7000                                           bool expand_call) {
7001 
7002   // If expand_call is true then we expand the call_VM_leaf macro
7003   // directly to skip generating the check by
7004   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
7005 
7006 #ifdef _LP64
7007   assert(thread == r15_thread, "must be");
7008 #endif // _LP64
7009 
7010   Label done;
7011   Label runtime;
7012 
7013   assert(pre_val != noreg, "check this code");
7014 
7015   if (obj != noreg) {
7016     assert_different_registers(obj, pre_val, tmp);
7017     assert(pre_val != rax, "check this code");
7018   }
7019 
7020   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
7021                                        PtrQueue::byte_offset_of_active()));
7022   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
7023                                        PtrQueue::byte_offset_of_index()));
7024   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
7025                                        PtrQueue::byte_offset_of_buf()));
7026 
7027 
7028   // Is marking active?
7029   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
7030     cmpl(in_progress, 0);
7031   } else {
7032     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
7033     cmpb(in_progress, 0);
7034   }
7035   jcc(Assembler::equal, done);
7036 
7037   // Do we need to load the previous value?
7038   if (obj != noreg) {
7039     load_heap_oop(pre_val, Address(obj, 0));
7040   }
7041 
7042   // Is the previous value null?
7043   cmpptr(pre_val, (int32_t) NULL_WORD);
7044   jcc(Assembler::equal, done);
7045 
7046   // Can we store original value in the thread's buffer?
7047   // Is index == 0?
7048   // (The index field is typed as size_t.)
7049 
7050   movptr(tmp, index);                   // tmp := *index_adr
7051   cmpptr(tmp, 0);                       // tmp == 0?
7052   jcc(Assembler::equal, runtime);       // If yes, goto runtime
7053 
7054   subptr(tmp, wordSize);                // tmp := tmp - wordSize
7055   movptr(index, tmp);                   // *index_adr := tmp
7056   addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
7057 
7058   // Record the previous value
7059   movptr(Address(tmp, 0), pre_val);
7060   jmp(done);
7061 
7062   bind(runtime);
7063   // save the live input values
7064   if(tosca_live) push(rax);
7065 
7066   if (obj != noreg && obj != rax)
7067     push(obj);
7068 
7069   if (pre_val != rax)
7070     push(pre_val);
7071 
7072   // Calling the runtime using the regular call_VM_leaf mechanism generates
7073   // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
7074   // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
7075   //
7076   // If we care generating the pre-barrier without a frame (e.g. in the
7077   // intrinsified Reference.get() routine) then ebp might be pointing to
7078   // the caller frame and so this check will most likely fail at runtime.
7079   //
7080   // Expanding the call directly bypasses the generation of the check.
7081   // So when we do not have have a full interpreter frame on the stack
7082   // expand_call should be passed true.
7083 
7084   NOT_LP64( push(thread); )
7085 
7086   if (expand_call) {
7087     LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
7088     pass_arg1(this, thread);
7089     pass_arg0(this, pre_val);
7090     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
7091   } else {
7092     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
7093   }
7094 
7095   NOT_LP64( pop(thread); )
7096 
7097   // save the live input values
7098   if (pre_val != rax)
7099     pop(pre_val);
7100 
7101   if (obj != noreg && obj != rax)
7102     pop(obj);
7103 
7104   if(tosca_live) pop(rax);
7105 
7106   bind(done);
7107 }
7108 
7109 void MacroAssembler::g1_write_barrier_post(Register store_addr,
7110                                            Register new_val,
7111                                            Register thread,
7112                                            Register tmp,
7113                                            Register tmp2) {
7114 #ifdef _LP64
7115   assert(thread == r15_thread, "must be");
7116 #endif // _LP64
7117 
7118   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
7119                                        PtrQueue::byte_offset_of_index()));
7120   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
7121                                        PtrQueue::byte_offset_of_buf()));
7122 
7123   BarrierSet* bs = Universe::heap()->barrier_set();
7124   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
7125   Label done;
7126   Label runtime;
7127 
7128   // Does store cross heap regions?
7129 
7130   movptr(tmp, store_addr);
7131   xorptr(tmp, new_val);
7132   shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
7133   jcc(Assembler::equal, done);
7134 
7135   // crosses regions, storing NULL?
7136 
7137   cmpptr(new_val, (int32_t) NULL_WORD);
7138   jcc(Assembler::equal, done);
7139 
7140   // storing region crossing non-NULL, is card already dirty?
7141 
7142   ExternalAddress cardtable((address) ct->byte_map_base);
7143   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
7144 #ifdef _LP64
7145   const Register card_addr = tmp;
7146 
7147   movq(card_addr, store_addr);
7148   shrq(card_addr, CardTableModRefBS::card_shift);
7149 
7150   lea(tmp2, cardtable);
7151 
7152   // get the address of the card
7153   addq(card_addr, tmp2);
7154 #else
7155   const Register card_index = tmp;
7156 
7157   movl(card_index, store_addr);
7158   shrl(card_index, CardTableModRefBS::card_shift);
7159 
7160   Address index(noreg, card_index, Address::times_1);
7161   const Register card_addr = tmp;
7162   lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
7163 #endif
7164   cmpb(Address(card_addr, 0), 0);
7165   jcc(Assembler::equal, done);
7166 
7167   // storing a region crossing, non-NULL oop, card is clean.
7168   // dirty card and log.
7169 
7170   movb(Address(card_addr, 0), 0);
7171 
7172   cmpl(queue_index, 0);
7173   jcc(Assembler::equal, runtime);
7174   subl(queue_index, wordSize);
7175   movptr(tmp2, buffer);
7176 #ifdef _LP64
7177   movslq(rscratch1, queue_index);
7178   addq(tmp2, rscratch1);
7179   movq(Address(tmp2, 0), card_addr);
7180 #else
7181   addl(tmp2, queue_index);
7182   movl(Address(tmp2, 0), card_index);
7183 #endif
7184   jmp(done);
7185 
7186   bind(runtime);
7187   // save the live input values
7188   push(store_addr);
7189   push(new_val);
7190 #ifdef _LP64
7191   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
7192 #else
7193   push(thread);
7194   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
7195   pop(thread);
7196 #endif
7197   pop(new_val);
7198   pop(store_addr);
7199 
7200   bind(done);
7201 }
7202 
7203 #endif // SERIALGC
7204 //////////////////////////////////////////////////////////////////////////////////
7205 
7206 
7207 void MacroAssembler::store_check(Register obj) {
7208   // Does a store check for the oop in register obj. The content of
7209   // register obj is destroyed afterwards.
7210   store_check_part_1(obj);
7211   store_check_part_2(obj);
7212 }
7213 
7214 void MacroAssembler::store_check(Register obj, Address dst) {
7215   store_check(obj);
7216 }
7217 
7218 
7219 // split the store check operation so that other instructions can be scheduled inbetween
7220 void MacroAssembler::store_check_part_1(Register obj) {
7221   BarrierSet* bs = Universe::heap()->barrier_set();
7222   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
7223   shrptr(obj, CardTableModRefBS::card_shift);
7224 }
7225 
7226 void MacroAssembler::store_check_part_2(Register obj) {
7227   BarrierSet* bs = Universe::heap()->barrier_set();
7228   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
7229   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
7230   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
7231 
7232   // The calculation for byte_map_base is as follows:
7233   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
7234   // So this essentially converts an address to a displacement and
7235   // it will never need to be relocated. On 64bit however the value may be too
7236   // large for a 32bit displacement
7237 
7238   intptr_t disp = (intptr_t) ct->byte_map_base;
7239   if (is_simm32(disp)) {
7240     Address cardtable(noreg, obj, Address::times_1, disp);
7241     movb(cardtable, 0);
7242   } else {
7243     // By doing it as an ExternalAddress disp could be converted to a rip-relative
7244     // displacement and done in a single instruction given favorable mapping and
7245     // a smarter version of as_Address. Worst case it is two instructions which
7246     // is no worse off then loading disp into a register and doing as a simple
7247     // Address() as above.
7248     // We can't do as ExternalAddress as the only style since if disp == 0 we'll
7249     // assert since NULL isn't acceptable in a reloci (see 6644928). In any case
7250     // in some cases we'll get a single instruction version.
7251 
7252     ExternalAddress cardtable((address)disp);
7253     Address index(noreg, obj, Address::times_1);
7254     movb(as_Address(ArrayAddress(cardtable, index)), 0);
7255   }
7256 }
7257 
7258 void MacroAssembler::subptr(Register dst, int32_t imm32) {
7259   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
7260 }
7261 
7262 void MacroAssembler::subptr(Register dst, Register src) {
7263   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
7264 }
7265 
7266 // C++ bool manipulation
7267 void MacroAssembler::testbool(Register dst) {
7268   if(sizeof(bool) == 1)
7269     testb(dst, 0xff);
7270   else if(sizeof(bool) == 2) {
7271     // testw implementation needed for two byte bools
7272     ShouldNotReachHere();
7273   } else if(sizeof(bool) == 4)
7274     testl(dst, dst);
7275   else
7276     // unsupported
7277     ShouldNotReachHere();
7278 }
7279 
7280 void MacroAssembler::testptr(Register dst, Register src) {
7281   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
7282 }
7283 
7284 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
7285 void MacroAssembler::tlab_allocate(Register obj,
7286                                    Register var_size_in_bytes,
7287                                    int con_size_in_bytes,
7288                                    Register t1,
7289                                    Register t2,
7290                                    Label& slow_case) {
7291   assert_different_registers(obj, t1, t2);
7292   assert_different_registers(obj, var_size_in_bytes, t1);
7293   Register end = t2;
7294   Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
7295 
7296   verify_tlab();
7297 
7298   NOT_LP64(get_thread(thread));
7299 
7300   movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
7301   if (var_size_in_bytes == noreg) {
7302     lea(end, Address(obj, con_size_in_bytes));
7303   } else {
7304     lea(end, Address(obj, var_size_in_bytes, Address::times_1));
7305   }
7306   cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
7307   jcc(Assembler::above, slow_case);
7308 
7309   // update the tlab top pointer
7310   movptr(Address(thread, JavaThread::tlab_top_offset()), end);
7311 
7312   // recover var_size_in_bytes if necessary
7313   if (var_size_in_bytes == end) {
7314     subptr(var_size_in_bytes, obj);
7315   }
7316   verify_tlab();
7317 }
7318 
7319 // Preserves rbx, and rdx.
7320 Register MacroAssembler::tlab_refill(Label& retry,
7321                                      Label& try_eden,
7322                                      Label& slow_case) {
7323   Register top = rax;
7324   Register t1  = rcx;
7325   Register t2  = rsi;
7326   Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
7327   assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
7328   Label do_refill, discard_tlab;
7329 
7330   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
7331     // No allocation in the shared eden.
7332     jmp(slow_case);
7333   }
7334 
7335   NOT_LP64(get_thread(thread_reg));
7336 
7337   movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
7338   movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
7339 
7340   // calculate amount of free space
7341   subptr(t1, top);
7342   shrptr(t1, LogHeapWordSize);
7343 
7344   // Retain tlab and allocate object in shared space if
7345   // the amount free in the tlab is too large to discard.
7346   cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
7347   jcc(Assembler::lessEqual, discard_tlab);
7348 
7349   // Retain
7350   // %%% yuck as movptr...
7351   movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
7352   addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
7353   if (TLABStats) {
7354     // increment number of slow_allocations
7355     addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
7356   }
7357   jmp(try_eden);
7358 
7359   bind(discard_tlab);
7360   if (TLABStats) {
7361     // increment number of refills
7362     addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
7363     // accumulate wastage -- t1 is amount free in tlab
7364     addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
7365   }
7366 
7367   // if tlab is currently allocated (top or end != null) then
7368   // fill [top, end + alignment_reserve) with array object
7369   testptr(top, top);
7370   jcc(Assembler::zero, do_refill);
7371 
7372   // set up the mark word
7373   movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
7374   // set the length to the remaining space
7375   subptr(t1, typeArrayOopDesc::header_size(T_INT));
7376   addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
7377   shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
7378   movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
7379   // set klass to intArrayKlass
7380   // dubious reloc why not an oop reloc?
7381   movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
7382   // store klass last.  concurrent gcs assumes klass length is valid if
7383   // klass field is not null.
7384   store_klass(top, t1);
7385 
7386   movptr(t1, top);
7387   subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
7388   incr_allocated_bytes(thread_reg, t1, 0);
7389 
7390   // refill the tlab with an eden allocation
7391   bind(do_refill);
7392   movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
7393   shlptr(t1, LogHeapWordSize);
7394   // allocate new tlab, address returned in top
7395   eden_allocate(top, t1, 0, t2, slow_case);
7396 
7397   // Check that t1 was preserved in eden_allocate.
7398 #ifdef ASSERT
7399   if (UseTLAB) {
7400     Label ok;
7401     Register tsize = rsi;
7402     assert_different_registers(tsize, thread_reg, t1);
7403     push(tsize);
7404     movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
7405     shlptr(tsize, LogHeapWordSize);
7406     cmpptr(t1, tsize);
7407     jcc(Assembler::equal, ok);
7408     stop("assert(t1 != tlab size)");
7409     should_not_reach_here();
7410 
7411     bind(ok);
7412     pop(tsize);
7413   }
7414 #endif
7415   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
7416   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
7417   addptr(top, t1);
7418   subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
7419   movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
7420   verify_tlab();
7421   jmp(retry);
7422 
7423   return thread_reg; // for use by caller
7424 }
7425 
7426 void MacroAssembler::incr_allocated_bytes(Register thread,
7427                                           Register var_size_in_bytes,
7428                                           int con_size_in_bytes,
7429                                           Register t1) {
7430 #ifdef _LP64
7431   if (var_size_in_bytes->is_valid()) {
7432     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
7433   } else {
7434     addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
7435   }
7436 #else
7437   if (!thread->is_valid()) {
7438     assert(t1->is_valid(), "need temp reg");
7439     thread = t1;
7440     get_thread(thread);
7441   }
7442 
7443   if (var_size_in_bytes->is_valid()) {
7444     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
7445   } else {
7446     addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
7447   }
7448   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
7449 #endif
7450 }
7451 
7452 static const double     pi_4 =  0.7853981633974483;
7453 
7454 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
7455   // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
7456   // was attempted in this code; unfortunately it appears that the
7457   // switch to 80-bit precision and back causes this to be
7458   // unprofitable compared with simply performing a runtime call if
7459   // the argument is out of the (-pi/4, pi/4) range.
7460 
7461   Register tmp = noreg;
7462   if (!VM_Version::supports_cmov()) {
7463     // fcmp needs a temporary so preserve rbx,
7464     tmp = rbx;
7465     push(tmp);
7466   }
7467 
7468   Label slow_case, done;
7469 
7470   ExternalAddress pi4_adr = (address)&pi_4;
7471   if (reachable(pi4_adr)) {
7472     // x ?<= pi/4
7473     fld_d(pi4_adr);
7474     fld_s(1);                // Stack:  X  PI/4  X
7475     fabs();                  // Stack: |X| PI/4  X
7476     fcmp(tmp);
7477     jcc(Assembler::above, slow_case);
7478 
7479     // fastest case: -pi/4 <= x <= pi/4
7480     switch(trig) {
7481     case 's':
7482       fsin();
7483       break;
7484     case 'c':
7485       fcos();
7486       break;
7487     case 't':
7488       ftan();
7489       break;
7490     default:
7491       assert(false, "bad intrinsic");
7492       break;
7493     }
7494     jmp(done);
7495   }
7496 
7497   // slow case: runtime call
7498   bind(slow_case);
7499   // Preserve registers across runtime call
7500   pusha();
7501   int incoming_argument_and_return_value_offset = -1;
7502   if (num_fpu_regs_in_use > 1) {
7503     // Must preserve all other FPU regs (could alternatively convert
7504     // SharedRuntime::dsin and dcos into assembly routines known not to trash
7505     // FPU state, but can not trust C compiler)
7506     NEEDS_CLEANUP;
7507     // NOTE that in this case we also push the incoming argument to
7508     // the stack and restore it later; we also use this stack slot to
7509     // hold the return value from dsin or dcos.
7510     for (int i = 0; i < num_fpu_regs_in_use; i++) {
7511       subptr(rsp, sizeof(jdouble));
7512       fstp_d(Address(rsp, 0));
7513     }
7514     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
7515     fld_d(Address(rsp, incoming_argument_and_return_value_offset));
7516   }
7517   subptr(rsp, sizeof(jdouble));
7518   fstp_d(Address(rsp, 0));
7519 #ifdef _LP64
7520   movdbl(xmm0, Address(rsp, 0));
7521 #endif // _LP64
7522 
7523   // NOTE: we must not use call_VM_leaf here because that requires a
7524   // complete interpreter frame in debug mode -- same bug as 4387334
7525   // MacroAssembler::call_VM_leaf_base is perfectly safe and will
7526   // do proper 64bit abi
7527 
7528   NEEDS_CLEANUP;
7529   // Need to add stack banging before this runtime call if it needs to
7530   // be taken; however, there is no generic stack banging routine at
7531   // the MacroAssembler level
7532   switch(trig) {
7533   case 's':
7534     {
7535       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
7536     }
7537     break;
7538   case 'c':
7539     {
7540       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
7541     }
7542     break;
7543   case 't':
7544     {
7545       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
7546     }
7547     break;
7548   default:
7549     assert(false, "bad intrinsic");
7550     break;
7551   }
7552 #ifdef _LP64
7553     movsd(Address(rsp, 0), xmm0);
7554     fld_d(Address(rsp, 0));
7555 #endif // _LP64
7556   addptr(rsp, sizeof(jdouble));
7557   if (num_fpu_regs_in_use > 1) {
7558     // Must save return value to stack and then restore entire FPU stack
7559     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
7560     for (int i = 0; i < num_fpu_regs_in_use; i++) {
7561       fld_d(Address(rsp, 0));
7562       addptr(rsp, sizeof(jdouble));
7563     }
7564   }
7565   popa();
7566 
7567   // Come here with result in F-TOS
7568   bind(done);
7569 
7570   if (tmp != noreg) {
7571     pop(tmp);
7572   }
7573 }
7574 
7575 
7576 // Look up the method for a megamorphic invokeinterface call.
7577 // The target method is determined by <intf_klass, itable_index>.
7578 // The receiver klass is in recv_klass.
7579 // On success, the result will be in method_result, and execution falls through.
7580 // On failure, execution transfers to the given label.
7581 void MacroAssembler::lookup_interface_method(Register recv_klass,
7582                                              Register intf_klass,
7583                                              RegisterOrConstant itable_index,
7584                                              Register method_result,
7585                                              Register scan_temp,
7586                                              Label& L_no_such_interface) {
7587   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
7588   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
7589          "caller must use same register for non-constant itable index as for method");
7590 
7591   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
7592   int vtable_base = instanceKlass::vtable_start_offset() * wordSize;
7593   int itentry_off = itableMethodEntry::method_offset_in_bytes();
7594   int scan_step   = itableOffsetEntry::size() * wordSize;
7595   int vte_size    = vtableEntry::size() * wordSize;
7596   Address::ScaleFactor times_vte_scale = Address::times_ptr;
7597   assert(vte_size == wordSize, "else adjust times_vte_scale");
7598 
7599   movl(scan_temp, Address(recv_klass, instanceKlass::vtable_length_offset() * wordSize));
7600 
7601   // %%% Could store the aligned, prescaled offset in the klassoop.
7602   lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
7603   if (HeapWordsPerLong > 1) {
7604     // Round up to align_object_offset boundary
7605     // see code for instanceKlass::start_of_itable!
7606     round_to(scan_temp, BytesPerLong);
7607   }
7608 
7609   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
7610   assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
7611   lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
7612 
7613   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
7614   //   if (scan->interface() == intf) {
7615   //     result = (klass + scan->offset() + itable_index);
7616   //   }
7617   // }
7618   Label search, found_method;
7619 
7620   for (int peel = 1; peel >= 0; peel--) {
7621     movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
7622     cmpptr(intf_klass, method_result);
7623 
7624     if (peel) {
7625       jccb(Assembler::equal, found_method);
7626     } else {
7627       jccb(Assembler::notEqual, search);
7628       // (invert the test to fall through to found_method...)
7629     }
7630 
7631     if (!peel)  break;
7632 
7633     bind(search);
7634 
7635     // Check that the previous entry is non-null.  A null entry means that
7636     // the receiver class doesn't implement the interface, and wasn't the
7637     // same as when the caller was compiled.
7638     testptr(method_result, method_result);
7639     jcc(Assembler::zero, L_no_such_interface);
7640     addptr(scan_temp, scan_step);
7641   }
7642 
7643   bind(found_method);
7644 
7645   // Got a hit.
7646   movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
7647   movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
7648 }
7649 
7650 
7651 void MacroAssembler::check_klass_subtype(Register sub_klass,
7652                            Register super_klass,
7653                            Register temp_reg,
7654                            Label& L_success) {
7655   Label L_failure;
7656   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
7657   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
7658   bind(L_failure);
7659 }
7660 
7661 
7662 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
7663                                                    Register super_klass,
7664                                                    Register temp_reg,
7665                                                    Label* L_success,
7666                                                    Label* L_failure,
7667                                                    Label* L_slow_path,
7668                                         RegisterOrConstant super_check_offset) {
7669   assert_different_registers(sub_klass, super_klass, temp_reg);
7670   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
7671   if (super_check_offset.is_register()) {
7672     assert_different_registers(sub_klass, super_klass,
7673                                super_check_offset.as_register());
7674   } else if (must_load_sco) {
7675     assert(temp_reg != noreg, "supply either a temp or a register offset");
7676   }
7677 
7678   Label L_fallthrough;
7679   int label_nulls = 0;
7680   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
7681   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
7682   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
7683   assert(label_nulls <= 1, "at most one NULL in the batch");
7684 
7685   int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
7686                    Klass::secondary_super_cache_offset_in_bytes());
7687   int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
7688                     Klass::super_check_offset_offset_in_bytes());
7689   Address super_check_offset_addr(super_klass, sco_offset);
7690 
7691   // Hacked jcc, which "knows" that L_fallthrough, at least, is in
7692   // range of a jccb.  If this routine grows larger, reconsider at
7693   // least some of these.
7694 #define local_jcc(assembler_cond, label)                                \
7695   if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
7696   else                             jcc( assembler_cond, label) /*omit semi*/
7697 
7698   // Hacked jmp, which may only be used just before L_fallthrough.
7699 #define final_jmp(label)                                                \
7700   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
7701   else                            jmp(label)                /*omit semi*/
7702 
7703   // If the pointers are equal, we are done (e.g., String[] elements).
7704   // This self-check enables sharing of secondary supertype arrays among
7705   // non-primary types such as array-of-interface.  Otherwise, each such
7706   // type would need its own customized SSA.
7707   // We move this check to the front of the fast path because many
7708   // type checks are in fact trivially successful in this manner,
7709   // so we get a nicely predicted branch right at the start of the check.
7710   cmpptr(sub_klass, super_klass);
7711   local_jcc(Assembler::equal, *L_success);
7712 
7713   // Check the supertype display:
7714   if (must_load_sco) {
7715     // Positive movl does right thing on LP64.
7716     movl(temp_reg, super_check_offset_addr);
7717     super_check_offset = RegisterOrConstant(temp_reg);
7718   }
7719   Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
7720   cmpptr(super_klass, super_check_addr); // load displayed supertype
7721 
7722   // This check has worked decisively for primary supers.
7723   // Secondary supers are sought in the super_cache ('super_cache_addr').
7724   // (Secondary supers are interfaces and very deeply nested subtypes.)
7725   // This works in the same check above because of a tricky aliasing
7726   // between the super_cache and the primary super display elements.
7727   // (The 'super_check_addr' can address either, as the case requires.)
7728   // Note that the cache is updated below if it does not help us find
7729   // what we need immediately.
7730   // So if it was a primary super, we can just fail immediately.
7731   // Otherwise, it's the slow path for us (no success at this point).
7732 
7733   if (super_check_offset.is_register()) {
7734     local_jcc(Assembler::equal, *L_success);
7735     cmpl(super_check_offset.as_register(), sc_offset);
7736     if (L_failure == &L_fallthrough) {
7737       local_jcc(Assembler::equal, *L_slow_path);
7738     } else {
7739       local_jcc(Assembler::notEqual, *L_failure);
7740       final_jmp(*L_slow_path);
7741     }
7742   } else if (super_check_offset.as_constant() == sc_offset) {
7743     // Need a slow path; fast failure is impossible.
7744     if (L_slow_path == &L_fallthrough) {
7745       local_jcc(Assembler::equal, *L_success);
7746     } else {
7747       local_jcc(Assembler::notEqual, *L_slow_path);
7748       final_jmp(*L_success);
7749     }
7750   } else {
7751     // No slow path; it's a fast decision.
7752     if (L_failure == &L_fallthrough) {
7753       local_jcc(Assembler::equal, *L_success);
7754     } else {
7755       local_jcc(Assembler::notEqual, *L_failure);
7756       final_jmp(*L_success);
7757     }
7758   }
7759 
7760   bind(L_fallthrough);
7761 
7762 #undef local_jcc
7763 #undef final_jmp
7764 }
7765 
7766 
7767 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
7768                                                    Register super_klass,
7769                                                    Register temp_reg,
7770                                                    Register temp2_reg,
7771                                                    Label* L_success,
7772                                                    Label* L_failure,
7773                                                    bool set_cond_codes) {
7774   assert_different_registers(sub_klass, super_klass, temp_reg);
7775   if (temp2_reg != noreg)
7776     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
7777 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
7778 
7779   Label L_fallthrough;
7780   int label_nulls = 0;
7781   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
7782   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
7783   assert(label_nulls <= 1, "at most one NULL in the batch");
7784 
7785   // a couple of useful fields in sub_klass:
7786   int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
7787                    Klass::secondary_supers_offset_in_bytes());
7788   int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
7789                    Klass::secondary_super_cache_offset_in_bytes());
7790   Address secondary_supers_addr(sub_klass, ss_offset);
7791   Address super_cache_addr(     sub_klass, sc_offset);
7792 
7793   // Do a linear scan of the secondary super-klass chain.
7794   // This code is rarely used, so simplicity is a virtue here.
7795   // The repne_scan instruction uses fixed registers, which we must spill.
7796   // Don't worry too much about pre-existing connections with the input regs.
7797 
7798   assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
7799   assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
7800 
7801   // Get super_klass value into rax (even if it was in rdi or rcx).
7802   bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
7803   if (super_klass != rax || UseCompressedOops) {
7804     if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
7805     mov(rax, super_klass);
7806   }
7807   if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
7808   if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
7809 
7810 #ifndef PRODUCT
7811   int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
7812   ExternalAddress pst_counter_addr((address) pst_counter);
7813   NOT_LP64(  incrementl(pst_counter_addr) );
7814   LP64_ONLY( lea(rcx, pst_counter_addr) );
7815   LP64_ONLY( incrementl(Address(rcx, 0)) );
7816 #endif //PRODUCT
7817 
7818   // We will consult the secondary-super array.
7819   movptr(rdi, secondary_supers_addr);
7820   // Load the array length.  (Positive movl does right thing on LP64.)
7821   movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
7822   // Skip to start of data.
7823   addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
7824 
7825   // Scan RCX words at [RDI] for an occurrence of RAX.
7826   // Set NZ/Z based on last compare.
7827   // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
7828   // not change flags (only scas instruction which is repeated sets flags).
7829   // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
7830 #ifdef _LP64
7831   // This part is tricky, as values in supers array could be 32 or 64 bit wide
7832   // and we store values in objArrays always encoded, thus we need to encode
7833   // the value of rax before repne.  Note that rax is dead after the repne.
7834   if (UseCompressedOops) {
7835     encode_heap_oop_not_null(rax); // Changes flags.
7836     // The superclass is never null; it would be a basic system error if a null
7837     // pointer were to sneak in here.  Note that we have already loaded the
7838     // Klass::super_check_offset from the super_klass in the fast path,
7839     // so if there is a null in that register, we are already in the afterlife.
7840     testl(rax,rax); // Set Z = 0
7841     repne_scanl();
7842   } else
7843 #endif // _LP64
7844   {
7845     testptr(rax,rax); // Set Z = 0
7846     repne_scan();
7847   }
7848   // Unspill the temp. registers:
7849   if (pushed_rdi)  pop(rdi);
7850   if (pushed_rcx)  pop(rcx);
7851   if (pushed_rax)  pop(rax);
7852 
7853   if (set_cond_codes) {
7854     // Special hack for the AD files:  rdi is guaranteed non-zero.
7855     assert(!pushed_rdi, "rdi must be left non-NULL");
7856     // Also, the condition codes are properly set Z/NZ on succeed/failure.
7857   }
7858 
7859   if (L_failure == &L_fallthrough)
7860         jccb(Assembler::notEqual, *L_failure);
7861   else  jcc(Assembler::notEqual, *L_failure);
7862 
7863   // Success.  Cache the super we found and proceed in triumph.
7864   movptr(super_cache_addr, super_klass);
7865 
7866   if (L_success != &L_fallthrough) {
7867     jmp(*L_success);
7868   }
7869 
7870 #undef IS_A_TEMP
7871 
7872   bind(L_fallthrough);
7873 }
7874 
7875 
7876 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
7877   ucomisd(dst, as_Address(src));
7878 }
7879 
7880 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
7881   ucomiss(dst, as_Address(src));
7882 }
7883 
7884 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
7885   if (reachable(src)) {
7886     xorpd(dst, as_Address(src));
7887   } else {
7888     lea(rscratch1, src);
7889     xorpd(dst, Address(rscratch1, 0));
7890   }
7891 }
7892 
7893 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
7894   if (reachable(src)) {
7895     xorps(dst, as_Address(src));
7896   } else {
7897     lea(rscratch1, src);
7898     xorps(dst, Address(rscratch1, 0));
7899   }
7900 }
7901 
7902 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
7903   if (VM_Version::supports_cmov()) {
7904     cmovl(cc, dst, src);
7905   } else {
7906     Label L;
7907     jccb(negate_condition(cc), L);
7908     movl(dst, src);
7909     bind(L);
7910   }
7911 }
7912 
7913 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
7914   if (VM_Version::supports_cmov()) {
7915     cmovl(cc, dst, src);
7916   } else {
7917     Label L;
7918     jccb(negate_condition(cc), L);
7919     movl(dst, src);
7920     bind(L);
7921   }
7922 }
7923 
7924 void MacroAssembler::verify_oop(Register reg, const char* s) {
7925   if (!VerifyOops) return;
7926 
7927   // Pass register number to verify_oop_subroutine
7928   char* b = new char[strlen(s) + 50];
7929   sprintf(b, "verify_oop: %s: %s", reg->name(), s);
7930 #ifdef _LP64
7931   push(rscratch1);                    // save r10, trashed by movptr()
7932 #endif
7933   push(rax);                          // save rax,
7934   push(reg);                          // pass register argument
7935   ExternalAddress buffer((address) b);
7936   // avoid using pushptr, as it modifies scratch registers
7937   // and our contract is not to modify anything
7938   movptr(rax, buffer.addr());
7939   push(rax);
7940   // call indirectly to solve generation ordering problem
7941   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
7942   call(rax);
7943   // Caller pops the arguments (oop, message) and restores rax, r10
7944 }
7945 
7946 
7947 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
7948                                                       Register tmp,
7949                                                       int offset) {
7950   intptr_t value = *delayed_value_addr;
7951   if (value != 0)
7952     return RegisterOrConstant(value + offset);
7953 
7954   // load indirectly to solve generation ordering problem
7955   movptr(tmp, ExternalAddress((address) delayed_value_addr));
7956 
7957 #ifdef ASSERT
7958   { Label L;
7959     testptr(tmp, tmp);
7960     if (WizardMode) {
7961       jcc(Assembler::notZero, L);
7962       char* buf = new char[40];
7963       sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
7964       stop(buf);
7965     } else {
7966       jccb(Assembler::notZero, L);
7967       hlt();
7968     }
7969     bind(L);
7970   }
7971 #endif
7972 
7973   if (offset != 0)
7974     addptr(tmp, offset);
7975 
7976   return RegisterOrConstant(tmp);
7977 }
7978 
7979 
7980 // registers on entry:
7981 //  - rax ('check' register): required MethodType
7982 //  - rcx: method handle
7983 //  - rdx, rsi, or ?: killable temp
7984 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
7985                                               Register temp_reg,
7986                                               Label& wrong_method_type) {
7987   Address type_addr(mh_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg));
7988   // compare method type against that of the receiver
7989   if (UseCompressedOops) {
7990     load_heap_oop(temp_reg, type_addr);
7991     cmpptr(mtype_reg, temp_reg);
7992   } else {
7993     cmpptr(mtype_reg, type_addr);
7994   }
7995   jcc(Assembler::notEqual, wrong_method_type);
7996 }
7997 
7998 
7999 // A method handle has a "vmslots" field which gives the size of its
8000 // argument list in JVM stack slots.  This field is either located directly
8001 // in every method handle, or else is indirectly accessed through the
8002 // method handle's MethodType.  This macro hides the distinction.
8003 void MacroAssembler::load_method_handle_vmslots(Register vmslots_reg, Register mh_reg,
8004                                                 Register temp_reg) {
8005   assert_different_registers(vmslots_reg, mh_reg, temp_reg);
8006   // load mh.type.form.vmslots
8007   if (java_lang_invoke_MethodHandle::vmslots_offset_in_bytes() != 0) {
8008     // hoist vmslots into every mh to avoid dependent load chain
8009     movl(vmslots_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::vmslots_offset_in_bytes, temp_reg)));
8010   } else {
8011     Register temp2_reg = vmslots_reg;
8012     load_heap_oop(temp2_reg, Address(mh_reg,    delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg)));
8013     load_heap_oop(temp2_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, temp_reg)));
8014     movl(vmslots_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodTypeForm::vmslots_offset_in_bytes, temp_reg)));
8015   }
8016 }
8017 
8018 
8019 // registers on entry:
8020 //  - rcx: method handle
8021 //  - rdx: killable temp (interpreted only)
8022 //  - rax: killable temp (compiled only)
8023 void MacroAssembler::jump_to_method_handle_entry(Register mh_reg, Register temp_reg) {
8024   assert(mh_reg == rcx, "caller must put MH object in rcx");
8025   assert_different_registers(mh_reg, temp_reg);
8026 
8027   // pick out the interpreted side of the handler
8028   // NOTE: vmentry is not an oop!
8029   movptr(temp_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::vmentry_offset_in_bytes, temp_reg)));
8030 
8031   // off we go...
8032   jmp(Address(temp_reg, MethodHandleEntry::from_interpreted_entry_offset_in_bytes()));
8033 
8034   // for the various stubs which take control at this point,
8035   // see MethodHandles::generate_method_handle_stub
8036 }
8037 
8038 
8039 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
8040                                          int extra_slot_offset) {
8041   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
8042   int stackElementSize = Interpreter::stackElementSize;
8043   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
8044 #ifdef ASSERT
8045   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
8046   assert(offset1 - offset == stackElementSize, "correct arithmetic");
8047 #endif
8048   Register             scale_reg    = noreg;
8049   Address::ScaleFactor scale_factor = Address::no_scale;
8050   if (arg_slot.is_constant()) {
8051     offset += arg_slot.as_constant() * stackElementSize;
8052   } else {
8053     scale_reg    = arg_slot.as_register();
8054     scale_factor = Address::times(stackElementSize);
8055   }
8056   offset += wordSize;           // return PC is on stack
8057   return Address(rsp, scale_reg, scale_factor, offset);
8058 }
8059 
8060 
8061 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
8062   if (!VerifyOops) return;
8063 
8064   // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
8065   // Pass register number to verify_oop_subroutine
8066   char* b = new char[strlen(s) + 50];
8067   sprintf(b, "verify_oop_addr: %s", s);
8068 
8069 #ifdef _LP64
8070   push(rscratch1);                    // save r10, trashed by movptr()
8071 #endif
8072   push(rax);                          // save rax,
8073   // addr may contain rsp so we will have to adjust it based on the push
8074   // we just did (and on 64 bit we do two pushes)
8075   // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
8076   // stores rax into addr which is backwards of what was intended.
8077   if (addr.uses(rsp)) {
8078     lea(rax, addr);
8079     pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
8080   } else {
8081     pushptr(addr);
8082   }
8083 
8084   ExternalAddress buffer((address) b);
8085   // pass msg argument
8086   // avoid using pushptr, as it modifies scratch registers
8087   // and our contract is not to modify anything
8088   movptr(rax, buffer.addr());
8089   push(rax);
8090 
8091   // call indirectly to solve generation ordering problem
8092   movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
8093   call(rax);
8094   // Caller pops the arguments (addr, message) and restores rax, r10.
8095 }
8096 
8097 void MacroAssembler::verify_tlab() {
8098 #ifdef ASSERT
8099   if (UseTLAB && VerifyOops) {
8100     Label next, ok;
8101     Register t1 = rsi;
8102     Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
8103 
8104     push(t1);
8105     NOT_LP64(push(thread_reg));
8106     NOT_LP64(get_thread(thread_reg));
8107 
8108     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
8109     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
8110     jcc(Assembler::aboveEqual, next);
8111     stop("assert(top >= start)");
8112     should_not_reach_here();
8113 
8114     bind(next);
8115     movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
8116     cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
8117     jcc(Assembler::aboveEqual, ok);
8118     stop("assert(top <= end)");
8119     should_not_reach_here();
8120 
8121     bind(ok);
8122     NOT_LP64(pop(thread_reg));
8123     pop(t1);
8124   }
8125 #endif
8126 }
8127 
// Decoded view of an FPU control word held in the low 16 bits of _value.
// The object carries no other state, so it can be laid over a saved image.
class ControlWord {
 public:
  int32_t _value;

  // Two-bit fields.
  int  rounding_control() const        { return field2(10); }
  int  precision_control() const       { return field2(8);  }

  // Single-bit fields (bits 5..0).
  bool precision() const               { return bit(5); }
  bool underflow() const               { return bit(4); }
  bool overflow() const                { return bit(3); }
  bool zero_divide() const             { return bit(2); }
  bool denormalized() const            { return bit(1); }
  bool invalid() const                 { return bit(0); }

  // Print the raw word followed by the decoded bits (upper case when the
  // bit is set), rounding mode and precision.
  void print() const {
    // Both selectors are masked to two bits, so the tables are complete.
    static const char* const rc_text[4] = {
      "round near", "round down", "round up  ", "chop      "
    };
    static const char* const pc_text[4] = {
      "24 bits ", "reserved", "53 bits ", "64 bits "
    };
    const char* rc = rc_text[rounding_control()];
    const char* pc = pc_text[precision_control()];

    // flags
    const char f[9] = {
      ' ',
      ' ',
      precision   () ? 'P' : 'p',
      underflow   () ? 'U' : 'u',
      overflow    () ? 'O' : 'o',
      zero_divide () ? 'Z' : 'z',
      denormalized() ? 'D' : 'd',
      invalid     () ? 'I' : 'i',
      '\x0'
    };

    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

 private:
  bool bit(int n) const                { return ((_value >> n) & 1) != 0; }
  int  field2(int shift) const         { return (_value >> shift) & 3; }
};
8174 
// Decoded view of an FPU status word held in the low 16 bits of _value.
// The object carries no other state, so it can be laid over a saved image.
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return bit(15); }
  // Condition code bits.
  bool C3() const                      { return bit(14); }
  bool C2() const                      { return bit(10); }
  bool C1() const                      { return bit(9);  }
  bool C0() const                      { return bit(8);  }
  // Three-bit top-of-stack field (bits 13..11).
  int  top() const                     { return (_value >> 11) & 7; }
  // Summary and exception flags (bits 7..0).
  bool error_status() const            { return bit(7); }
  bool stack_fault() const             { return bit(6); }
  bool precision() const               { return bit(5); }
  bool underflow() const               { return bit(4); }
  bool overflow() const                { return bit(3); }
  bool zero_divide() const             { return bit(2); }
  bool denormalized() const            { return bit(1); }
  bool invalid() const                 { return bit(0); }

  // Print the raw word followed by the flags, condition codes and top.
  // A '-' stands for a bit that is clear.
  void print() const {
    // condition codes
    const char c[5] = {
      C3() ? '3' : '-',
      C2() ? '2' : '-',
      C1() ? '1' : '-',
      C0() ? '0' : '-',
      '\x0'
    };
    // flags
    const char f[9] = {
      error_status() ? 'E' : '-',
      stack_fault () ? 'S' : '-',
      precision   () ? 'P' : '-',
      underflow   () ? 'U' : '-',
      overflow    () ? 'O' : '-',
      zero_divide () ? 'Z' : '-',
      denormalized() ? 'D' : '-',
      invalid     () ? 'I' : '-',
      '\x0'
    };
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
  }

 private:
  bool bit(int n) const                { return ((_value >> n) & 1) != 0; }
};
8218 
// FPU tag word: eight two-bit tags packed into the low 16 bits of _value.
class TagWord {
 public:
  int32_t _value;

  // Two-bit tag number i (tag 0 occupies the lowest two bits).
  int tag_at(int i) const {
    const int shift = i * 2;
    return (_value >> shift) & 3;
  }

  // Print the low 16 bits as four hex digits.
  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};
8230 
// View of one saved FPU data register: two 32-bit mantissa words (_m0 is
// the low one, judging by print order) followed by a 16-bit sign/exponent
// word.
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True for the exact pattern _ex == -1, _m1 == 0xC0000000, _m0 == 0.
  bool is_indefinite() const {
    if (_ex != -1)                   return false;
    if (_m1 != (int32_t)0xC0000000)  return false;
    return _m0 == 0;
  }

  // Print sign, exponent word and mantissa in hex, followed by "NaN" when
  // the exponent word is 0x7FFF or all ones.
  void print() const {
    const bool nan_like = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    const char sign = (_ex < 0) ? '-' : '+';
    const char* kind = nan_like ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
  }

};
8248 
8249 class FPU_State {
8250  public:
8251   enum {
8252     register_size       = 10,
8253     number_of_registers =  8,
8254     register_mask       =  7
8255   };
8256 
8257   ControlWord  _control_word;
8258   StatusWord   _status_word;
8259   TagWord      _tag_word;
8260   int32_t      _error_offset;
8261   int32_t      _error_selector;
8262   int32_t      _data_offset;
8263   int32_t      _data_selector;
8264   int8_t       _register[register_size * number_of_registers];
8265 
8266   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
8267   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
8268 
8269   const char* tag_as_string(int tag) const {
8270     switch (tag) {
8271       case 0: return "valid";
8272       case 1: return "zero";
8273       case 2: return "special";
8274       case 3: return "empty";
8275     }
8276     ShouldNotReachHere();
8277     return NULL;
8278   }
8279 
8280   void print() const {
8281     // print computation registers
8282     { int t = _status_word.top();
8283       for (int i = 0; i < number_of_registers; i++) {
8284         int j = (i - t) & register_mask;
8285         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
8286         st(j)->print();
8287         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
8288       }
8289     }
8290     printf("\n");
8291     // print control registers
8292     printf("ctrl = "); _control_word.print(); printf("\n");
8293     printf("stat = "); _status_word .print(); printf("\n");
8294     printf("tags = "); _tag_word    .print(); printf("\n");
8295   }
8296 
8297 };
8298 
// Decoded view of a saved flags register held in _value.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return bit(11); }
  bool direction() const               { return bit(10); }
  bool sign() const                    { return bit(7);  }
  bool zero() const                    { return bit(6);  }
  bool auxiliary_carry() const         { return bit(4);  }
  bool parity() const                  { return bit(2);  }
  bool carry() const                   { return bit(0);  }

  // Print the raw value followed by one letter per set flag
  // ('-' when the flag is clear).
  void print() const {
    // flags
    const char f[8] = {
      overflow       () ? 'O' : '-',
      direction      () ? 'D' : '-',
      sign           () ? 'S' : '-',
      zero           () ? 'Z' : '-',
      auxiliary_carry() ? 'A' : '-',
      parity         () ? 'P' : '-',
      carry          () ? 'C' : '-',
      '\x0'
    };
    // output
    printf("%08x  flags = %s", _value, f);
  }

 private:
  bool bit(int n) const                { return ((_value >> n) & 1) != 0; }
};
8327 
// One saved 32-bit integer register value.
class IU_Register {
 public:
  int32_t _value;

  // Print the value as 8 hex digits, then as a right-aligned decimal.
  void print() const {
    const int32_t v = _value;
    printf("%08x  %11d", v, v);
  }

};
8337 
8338 class IU_State {
8339  public:
8340   Flag_Register _eflags;
8341   IU_Register   _rdi;
8342   IU_Register   _rsi;
8343   IU_Register   _rbp;
8344   IU_Register   _rsp;
8345   IU_Register   _rbx;
8346   IU_Register   _rdx;
8347   IU_Register   _rcx;
8348   IU_Register   _rax;
8349 
8350   void print() const {
8351     // computation registers
8352     printf("rax,  = "); _rax.print(); printf("\n");
8353     printf("rbx,  = "); _rbx.print(); printf("\n");
8354     printf("rcx  = "); _rcx.print(); printf("\n");
8355     printf("rdx  = "); _rdx.print(); printf("\n");
8356     printf("rdi  = "); _rdi.print(); printf("\n");
8357     printf("rsi  = "); _rsi.print(); printf("\n");
8358     printf("rbp,  = "); _rbp.print(); printf("\n");
8359     printf("rsp  = "); _rsp.print(); printf("\n");
8360     printf("\n");
8361     // control registers
8362     printf("flgs = "); _eflags.print(); printf("\n");
8363   }
8364 };
8365 
8366 
8367 class CPU_State {
8368  public:
8369   FPU_State _fpu_state;
8370   IU_State  _iu_state;
8371 
8372   void print() const {
8373     printf("--------------------------------------------------\n");
8374     _iu_state .print();
8375     printf("\n");
8376     _fpu_state.print();
8377     printf("--------------------------------------------------\n");
8378   }
8379 
8380 };
8381 
8382 
8383 static void _print_CPU_state(CPU_State* state) {
8384   state->print();
8385 };
8386 
8387 
8388 void MacroAssembler::print_CPU_state() {
8389   push_CPU_state();
8390   push(rsp);                // pass CPU state
8391   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
8392   addptr(rsp, wordSize);       // discard argument
8393   pop_CPU_state();
8394 }
8395 
8396 
8397 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
8398   static int counter = 0;
8399   FPU_State* fs = &state->_fpu_state;
8400   counter++;
8401   // For leaf calls, only verify that the top few elements remain empty.
8402   // We only need 1 empty at the top for C2 code.
8403   if( stack_depth < 0 ) {
8404     if( fs->tag_for_st(7) != 3 ) {
8405       printf("FPR7 not empty\n");
8406       state->print();
8407       assert(false, "error");
8408       return false;
8409     }
8410     return true;                // All other stack states do not matter
8411   }
8412 
8413   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
8414          "bad FPU control word");
8415 
8416   // compute stack depth
8417   int i = 0;
8418   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
8419   int d = i;
8420   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
8421   // verify findings
8422   if (i != FPU_State::number_of_registers) {
8423     // stack not contiguous
8424     printf("%s: stack not contiguous at ST%d\n", s, i);
8425     state->print();
8426     assert(false, "error");
8427     return false;
8428   }
8429   // check if computed stack depth corresponds to expected stack depth
8430   if (stack_depth < 0) {
8431     // expected stack depth is -stack_depth or less
8432     if (d > -stack_depth) {
8433       // too many elements on the stack
8434       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
8435       state->print();
8436       assert(false, "error");
8437       return false;
8438     }
8439   } else {
8440     // expected stack depth is stack_depth
8441     if (d != stack_depth) {
8442       // wrong stack depth
8443       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
8444       state->print();
8445       assert(false, "error");
8446       return false;
8447     }
8448   }
8449   // everything is cool
8450   return true;
8451 }
8452 
8453 
8454 void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
8455   if (!VerifyFPU) return;
8456   push_CPU_state();
8457   push(rsp);                // pass CPU state
8458   ExternalAddress msg((address) s);
8459   // pass message string s
8460   pushptr(msg.addr());
8461   push(stack_depth);        // pass stack depth
8462   call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
8463   addptr(rsp, 3 * wordSize);   // discard arguments
8464   // check for error
8465   { Label L;
8466     testl(rax, rax);
8467     jcc(Assembler::notZero, L);
8468     int3();                  // break if error condition
8469     bind(L);
8470   }
8471   pop_CPU_state();
8472 }
8473 
8474 void MacroAssembler::load_klass(Register dst, Register src) {
8475 #ifdef _LP64
8476   if (UseCompressedOops) {
8477     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
8478     decode_heap_oop_not_null(dst);
8479   } else
8480 #endif
8481     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
8482 }
8483 
8484 void MacroAssembler::load_prototype_header(Register dst, Register src) {
8485 #ifdef _LP64
8486   if (UseCompressedOops) {
8487     assert (Universe::heap() != NULL, "java heap should be initialized");
8488     movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
8489     if (Universe::narrow_oop_shift() != 0) {
8490       assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
8491       if (LogMinObjAlignmentInBytes == Address::times_8) {
8492         movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
8493       } else {
8494         // OK to use shift since we don't need to preserve flags.
8495         shlq(dst, LogMinObjAlignmentInBytes);
8496         movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
8497       }
8498     } else {
8499       movq(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
8500     }
8501   } else
8502 #endif
8503   {
8504     movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
8505     movptr(dst, Address(dst, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
8506   }
8507 }
8508 
8509 void MacroAssembler::store_klass(Register dst, Register src) {
8510 #ifdef _LP64
8511   if (UseCompressedOops) {
8512     encode_heap_oop_not_null(src);
8513     movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
8514   } else
8515 #endif
8516     movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
8517 }
8518 
8519 void MacroAssembler::load_heap_oop(Register dst, Address src) {
8520 #ifdef _LP64
8521   if (UseCompressedOops) {
8522     movl(dst, src);
8523     decode_heap_oop(dst);
8524   } else
8525 #endif
8526     movptr(dst, src);
8527 }
8528 
8529 // Doesn't do verfication, generates fixed size code
8530 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
8531 #ifdef _LP64
8532   if (UseCompressedOops) {
8533     movl(dst, src);
8534     decode_heap_oop_not_null(dst);
8535   } else
8536 #endif
8537     movptr(dst, src);
8538 }
8539 
8540 void MacroAssembler::store_heap_oop(Address dst, Register src) {
8541 #ifdef _LP64
8542   if (UseCompressedOops) {
8543     assert(!dst.uses(src), "not enough registers");
8544     encode_heap_oop(src);
8545     movl(dst, src);
8546   } else
8547 #endif
8548     movptr(dst, src);
8549 }
8550 
8551 // Used for storing NULLs.
8552 void MacroAssembler::store_heap_oop_null(Address dst) {
8553 #ifdef _LP64
8554   if (UseCompressedOops) {
8555     movl(dst, (int32_t)NULL_WORD);
8556   } else {
8557     movslq(dst, (int32_t)NULL_WORD);
8558   }
8559 #else
8560   movl(dst, (int32_t)NULL_WORD);
8561 #endif
8562 }
8563 
8564 #ifdef _LP64
8565 void MacroAssembler::store_klass_gap(Register dst, Register src) {
8566   if (UseCompressedOops) {
8567     // Store to klass gap in destination
8568     movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
8569   }
8570 }
8571 
8572 #ifdef ASSERT
8573 void MacroAssembler::verify_heapbase(const char* msg) {
8574   assert (UseCompressedOops, "should be compressed");
8575   assert (Universe::heap() != NULL, "java heap should be initialized");
8576   if (CheckCompressedOops) {
8577     Label ok;
8578     push(rscratch1); // cmpptr trashes rscratch1
8579     cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
8580     jcc(Assembler::equal, ok);
8581     stop(msg);
8582     bind(ok);
8583     pop(rscratch1);
8584   }
8585 }
8586 #endif
8587 
8588 // Algorithm must match oop.inline.hpp encode_heap_oop.
8589 void MacroAssembler::encode_heap_oop(Register r) {
8590 #ifdef ASSERT
8591   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
8592 #endif
8593   verify_oop(r, "broken oop in encode_heap_oop");
8594   if (Universe::narrow_oop_base() == NULL) {
8595     if (Universe::narrow_oop_shift() != 0) {
8596       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
8597       shrq(r, LogMinObjAlignmentInBytes);
8598     }
8599     return;
8600   }
8601   testq(r, r);
8602   cmovq(Assembler::equal, r, r12_heapbase);
8603   subq(r, r12_heapbase);
8604   shrq(r, LogMinObjAlignmentInBytes);
8605 }
8606 
8607 void MacroAssembler::encode_heap_oop_not_null(Register r) {
8608 #ifdef ASSERT
8609   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
8610   if (CheckCompressedOops) {
8611     Label ok;
8612     testq(r, r);
8613     jcc(Assembler::notEqual, ok);
8614     stop("null oop passed to encode_heap_oop_not_null");
8615     bind(ok);
8616   }
8617 #endif
8618   verify_oop(r, "broken oop in encode_heap_oop_not_null");
8619   if (Universe::narrow_oop_base() != NULL) {
8620     subq(r, r12_heapbase);
8621   }
8622   if (Universe::narrow_oop_shift() != 0) {
8623     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
8624     shrq(r, LogMinObjAlignmentInBytes);
8625   }
8626 }
8627 
8628 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
8629 #ifdef ASSERT
8630   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
8631   if (CheckCompressedOops) {
8632     Label ok;
8633     testq(src, src);
8634     jcc(Assembler::notEqual, ok);
8635     stop("null oop passed to encode_heap_oop_not_null2");
8636     bind(ok);
8637   }
8638 #endif
8639   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
8640   if (dst != src) {
8641     movq(dst, src);
8642   }
8643   if (Universe::narrow_oop_base() != NULL) {
8644     subq(dst, r12_heapbase);
8645   }
8646   if (Universe::narrow_oop_shift() != 0) {
8647     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
8648     shrq(dst, LogMinObjAlignmentInBytes);
8649   }
8650 }
8651 
8652 void  MacroAssembler::decode_heap_oop(Register r) {
8653 #ifdef ASSERT
8654   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
8655 #endif
8656   if (Universe::narrow_oop_base() == NULL) {
8657     if (Universe::narrow_oop_shift() != 0) {
8658       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
8659       shlq(r, LogMinObjAlignmentInBytes);
8660     }
8661   } else {
8662     Label done;
8663     shlq(r, LogMinObjAlignmentInBytes);
8664     jccb(Assembler::equal, done);
8665     addq(r, r12_heapbase);
8666     bind(done);
8667   }
8668   verify_oop(r, "broken oop in decode_heap_oop");
8669 }
8670 
8671 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
8672   // Note: it will change flags
8673   assert (UseCompressedOops, "should only be used for compressed headers");
8674   assert (Universe::heap() != NULL, "java heap should be initialized");
8675   // Cannot assert, unverified entry point counts instructions (see .ad file)
8676   // vtableStubs also counts instructions in pd_code_size_limit.
8677   // Also do not verify_oop as this is called by verify_oop.
8678   if (Universe::narrow_oop_shift() != 0) {
8679     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
8680     shlq(r, LogMinObjAlignmentInBytes);
8681     if (Universe::narrow_oop_base() != NULL) {
8682       addq(r, r12_heapbase);
8683     }
8684   } else {
8685     assert (Universe::narrow_oop_base() == NULL, "sanity");
8686   }
8687 }
8688 
8689 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
8690   // Note: it will change flags
8691   assert (UseCompressedOops, "should only be used for compressed headers");
8692   assert (Universe::heap() != NULL, "java heap should be initialized");
8693   // Cannot assert, unverified entry point counts instructions (see .ad file)
8694   // vtableStubs also counts instructions in pd_code_size_limit.
8695   // Also do not verify_oop as this is called by verify_oop.
8696   if (Universe::narrow_oop_shift() != 0) {
8697     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
8698     if (LogMinObjAlignmentInBytes == Address::times_8) {
8699       leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
8700     } else {
8701       if (dst != src) {
8702         movq(dst, src);
8703       }
8704       shlq(dst, LogMinObjAlignmentInBytes);
8705       if (Universe::narrow_oop_base() != NULL) {
8706         addq(dst, r12_heapbase);
8707       }
8708     }
8709   } else {
8710     assert (Universe::narrow_oop_base() == NULL, "sanity");
8711     if (dst != src) {
8712       movq(dst, src);
8713     }
8714   }
8715 }
8716 
8717 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
8718   assert (UseCompressedOops, "should only be used for compressed headers");
8719   assert (Universe::heap() != NULL, "java heap should be initialized");
8720   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
8721   int oop_index = oop_recorder()->find_index(obj);
8722   RelocationHolder rspec = oop_Relocation::spec(oop_index);
8723   mov_narrow_oop(dst, oop_index, rspec);
8724 }
8725 
8726 void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
8727   assert (UseCompressedOops, "should only be used for compressed headers");
8728   assert (Universe::heap() != NULL, "java heap should be initialized");
8729   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
8730   int oop_index = oop_recorder()->find_index(obj);
8731   RelocationHolder rspec = oop_Relocation::spec(oop_index);
8732   mov_narrow_oop(dst, oop_index, rspec);
8733 }
8734 
8735 void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
8736   assert (UseCompressedOops, "should only be used for compressed headers");
8737   assert (Universe::heap() != NULL, "java heap should be initialized");
8738   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
8739   int oop_index = oop_recorder()->find_index(obj);
8740   RelocationHolder rspec = oop_Relocation::spec(oop_index);
8741   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
8742 }
8743 
8744 void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
8745   assert (UseCompressedOops, "should only be used for compressed headers");
8746   assert (Universe::heap() != NULL, "java heap should be initialized");
8747   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
8748   int oop_index = oop_recorder()->find_index(obj);
8749   RelocationHolder rspec = oop_Relocation::spec(oop_index);
8750   Assembler::cmp_narrow_oop(dst, oop_index, rspec);
8751 }
8752 
8753 void MacroAssembler::reinit_heapbase() {
8754   if (UseCompressedOops) {
8755     movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
8756   }
8757 }
8758 #endif // _LP64
8759 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Emits an SSE4.2 (pcmpestri) based search of the string at str1 for the
// constant-length substring at str2.
//
//   str1     - address of the scanned string (16-bit elements)
//   str2     - address of the substring
//   cnt1     - string length in elements; must be rdx
//   cnt2     - substring length register; must be rax (set from int_cnt2)
//   int_cnt2 - compile-time substring length, >= 8
//   result   - receives the element index of the match, or -1 if none
//   vec      - XMM register holding the current 8-element substring chunk
//   tmp      - scratch; must be rcx (pcmpestri writes the match index there)
void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                      Register cnt1, Register cnt2,
                                      int int_cnt2,  Register result,
                                      XMMRegister vec, Register tmp) {
  assert(UseSSE42Intrinsics, "SSE4.2 is required");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= 8, "this code isused only for cnt2 >= 8 chars");

  // Load substring.
  movdqu(vec, Address(str2, 0));
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > 8) {
    // Skip the reload code on first entry; it is reached only by jumps.
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movdqu(vec, Address(str2, 0));
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == 8) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == 8) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(EXIT);

  if (int_cnt2 > 8) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), 0x0d);
    // Reload only string if does not match
    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    // From here on cnt2 holds a negative count: -(int_cnt2 - 8).
    negptr(cnt2);
    addptr(cnt2, 8);

    bind(SCAN_SUBSTR);
    subl(cnt1, 8);
    cmpl(cnt2, -8); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, 8);
    movl(cnt2, 8); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Address the next chunk as (base + cnt2*2 + int_cnt2*2); cnt2 is
      // negative, so this indexes backwards from the end of the substring.
      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
    }
    // Need to reload strings pointers if not matched whole vector
    jccb(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, 8);
    jccb(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // index
  bind(EXIT);

} // string_indexofC8
8910 
// Small strings are loaded through stack if they cross page boundary.
//
// Emits an SSE4.2 (pcmpestri) based search of the string at str1 for the
// substring at str2, for substrings that are either short constants or of
// non-constant length.
//
//   str1     - address of the scanned string (16-bit elements); clobbered
//   str2     - address of the substring; may be clobbered
//   cnt1     - string length in elements; must be rdx
//   cnt2     - substring length in elements; must be rax
//   int_cnt2 - constant substring length (1..7), or -1 if the length is only
//              known at run time (then it is taken from cnt2)
//   result   - receives the element index of the match, or -1 if none
//   vec      - XMM register holding the current substring chunk
//   tmp      - scratch; must be rcx (pcmpestri writes the match index there)
//
// The emitted code saves the incoming rsp on the stack and restores it at
// CLEANUP, which releases any temporary stack copies made during setup.
void MacroAssembler::string_indexof(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    int int_cnt2,  Register result,
                                    XMMRegister vec, Register tmp) {
  assert(UseSSE42Intrinsics, "SSE4.2 is required");
  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == 1) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (int_cnt2 == 2) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (int_cnt2 == 4) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 }
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*typeArrayKlass::header_size() >= 12,"sanity");
        // Load 16 bytes ending at the end of the substring, then shift the
        // bytes that precede the substring out of the vector.
        movdqu(vec, Address(str2, (int_cnt2*2)-16));
        psrldq(vec, 16-(int_cnt2*2));
      }
    } else { // not constant substring
      cmpl(cnt2, 8);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      // The copy loop indexes with cnt2 counting down from the length to 1,
      // hence the -2; wordSize accounts for the cnt2 value pushed below.
      int stk_offset = wordSize-2;
      push(cnt2);

      bind(COPY_SUBSTR);
      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, 8);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    // Copy the short string to a fresh 16-byte stack buffer, using cnt2 as
    // the loop counter (its value is saved around the loop when it is live).
    subptr(rsp, 16);
    int stk_offset = -2;
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      movdqu(vec, Address(str2, 0));
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    // Skip the reload code on first entry; it is reached only by jumps.
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    movdqu(vec, Address(str2, 0));
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    shrl(str1, 1);
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, 8); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, Address::times_2, -16));
  movl(cnt1, 8);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, 8-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, 8);
    jccb(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, 8);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    // str1 is reused as the cursor into the string from here on; the
    // original string address is reloaded from the stack at RET_FOUND_LONG.
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), 0x0d);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, 8);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    addptr(str2, 16);
    subl(cnt1, 8);
    cmpl(cnt2, 8); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
    lea(str2, Address(str2, cnt2, Address::times_2, -16));
    lea(str1, Address(str1, cnt2, Address::times_2, -16));
    subl(cnt1, cnt2);
    movl(cnt2, 8);
    addl(cnt1, 8);
    bind(CONT_SCAN_SUBSTR);
    movdqu(vec, Address(str2, 0));
    jmpb(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // Reload the string address saved at sp+1*wordSize.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // index

  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
9172 
9173 // Compare strings: emits a lexicographic comparison of two sequences of 16-bit chars.
     //
     //   str1, str2 - addresses of the first characters; modified by the emitted code
     //   cnt1, cnt2 - lengths in characters; both are overwritten
     //   result     - receives the comparison value: the difference of the first pair of
     //                mismatched characters (str1 char minus str2 char) or, when the
     //                strings agree up to the shorter length, cnt1 - cnt2
     //   vec1       - XMM temporary, used only when UseSSE42Intrinsics is set
     //
     // The length difference is kept on the stack while the emitted sequence runs;
     // both exit paths (LENGTH_DIFF_LABEL and POP_LABEL) pop it again.
9174 void MacroAssembler::string_compare(Register str1, Register str2,
9175                                     Register cnt1, Register cnt2, Register result,
9176                                     XMMRegister vec1) {
9177   Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
9178 
9179   // Compute the minimum of the string lengths and the
9180   // difference of the string lengths (stack).
9181   // Do the conditional move stuff
9182   movl(result, cnt1);  // result = original cnt1
9183   subl(cnt1, cnt2);  // cnt1 = cnt1 - cnt2; the flags from this subtraction drive cmov32 below
9184   push(cnt1);  // save the length difference
9185   cmov32(Assembler::lessEqual, cnt2, result);  // cnt2 = min(cnt1, cnt2)
9186 
9187   // Is the minimum length zero?
9188   testl(cnt2, cnt2);
9189   jcc(Assembler::zero, LENGTH_DIFF_LABEL);  // nothing to compare: answer is the length difference
9190 
9191   // Load first characters
9192   load_unsigned_short(result, Address(str1, 0));
9193   load_unsigned_short(cnt1, Address(str2, 0));
9194 
9195   // Compare first characters
9196   subl(result, cnt1);
9197   jcc(Assembler::notZero,  POP_LABEL);  // result already holds the character difference
9198   decrementl(cnt2);
9199   jcc(Assembler::zero, LENGTH_DIFF_LABEL);  // the only common character matched
9200 
9201   {
9202     // Check after comparing first character to see if strings are equivalent
9203     Label LSkip2;
9204     // Check if the strings start at same location
9205     cmpptr(str1, str2);
9206     jccb(Assembler::notEqual, LSkip2);
9207 
9208     // Check if the length difference is zero (from stack)
9209     cmpl(Address(rsp, 0), 0x0);
9210     jcc(Assembler::equal,  LENGTH_DIFF_LABEL);  // same start and same length: pops 0 into result
9211 
9212     // Strings might not be equivalent
9213     bind(LSkip2);
9214   }
9215 
9216   Address::ScaleFactor scale = Address::times_2;  // index scale for 2-byte characters
9217   int stride = 8;  // characters per 16-byte vector
9218 
9219   // Advance to next element (16/stride == 2 bytes, i.e. one character)
9220   addptr(str1, 16/stride);
9221   addptr(str2, 16/stride);
9222 
9223   if (UseSSE42Intrinsics) {
9224     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
9225     int pcmpmask = 0x19;
9226     // Setup to compare 16-byte vectors
9227     movl(result, cnt2);  // result = remaining character count
9228     andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
9229     jccb(Assembler::zero, COMPARE_TAIL);
9230 
         // Point both strings at the end of the remaining characters and walk
         // towards it with a negative index held in result.
9231     lea(str1, Address(str1, result, scale));
9232     lea(str2, Address(str2, result, scale));
9233     negptr(result);
9234 
9235     // pcmpestri
9236     //   inputs:
9237     //     vec1- 16 bytes loaded from str1
9238     //     rax - negative string length (elements count)
9239     //     mem - scanned string (16 bytes of str2)
9240     //     rdx - string length (elements count)
9241     //     pcmpmask - cmp mode: 11000 (string compare with negated result)
9242     //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
9243     //   outputs:
9244     //     rcx - first mismatched element index
9245     assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
9246 
9247     bind(COMPARE_WIDE_VECTORS);
9248     movdqu(vec1, Address(str1, result, scale));
9249     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
9250     // After pcmpestri cnt1(rcx) contains mismatched element index
9251 
9252     jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
9253     addptr(result, stride);
9254     subptr(cnt2, stride);
9255     jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
9256 
9257     // compare wide vectors tail
         // result is zero here iff the remaining count was a multiple of stride.
9258     testl(result, result);
9259     jccb(Assembler::zero, LENGTH_DIFF_LABEL);
9260 
         // Compare the last stride characters before the end of both strings;
         // this vector overlaps characters that were already compared above.
9261     movl(cnt2, stride);
9262     movl(result, stride);
9263     negptr(result);
9264     movdqu(vec1, Address(str1, result, scale));
9265     pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
9266     jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);  // CF==0: no mismatch in the tail
9267 
9268     // Mismatched characters in the vectors
9269     bind(VECTOR_NOT_EQUAL);
9270     addptr(result, cnt1);  // (negative) index of the mismatched character
9271     movptr(cnt2, result);
9272     load_unsigned_short(result, Address(str1, cnt2, scale));
9273     load_unsigned_short(cnt1, Address(str2, cnt2, scale));
9274     subl(result, cnt1);
9275     jmpb(POP_LABEL);
9276 
9277     bind(COMPARE_TAIL); // vector count (cnt2) is zero: fewer than stride characters remain
9278     movl(cnt2, result);  // restore the remaining character count cleared by andl
9279     // Fallthru to tail compare
9280   }
9281 
9282   // Shift str2 and str1 to the end of the arrays, negate min
9283   lea(str1, Address(str1, cnt2, scale, 0));
9284   lea(str2, Address(str2, cnt2, scale, 0));
9285   negptr(cnt2);
9286 
9287   // Compare the rest of the elements
       // cnt2 counts up from the negated length to zero.
9288   bind(WHILE_HEAD_LABEL);
9289   load_unsigned_short(result, Address(str1, cnt2, scale, 0));
9290   load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
9291   subl(result, cnt1);
9292   jccb(Assembler::notZero, POP_LABEL);
9293   increment(cnt2);
9294   jccb(Assembler::notZero, WHILE_HEAD_LABEL);
9295 
9296   // Strings are equal up to min length.  Return the length difference.
9297   bind(LENGTH_DIFF_LABEL);
9298   pop(result);
9299   jmpb(DONE_LABEL);
9300 
9301   // Discard the stored length difference
       // (result keeps the character difference; cnt1 is only a scratch target)
9302   bind(POP_LABEL);
9303   pop(cnt1);
9304 
9305   // That's it
9306   bind(DONE_LABEL);
9307 }
9308 
9309 // Compare char[] arrays aligned to 4 bytes or substrings.
     //
     // Emits code that sets result to 1 when both character sequences have equal
     // contents and to 0 otherwise.
     //
     //   is_array_equ - code-generation-time switch. When true, ary1/ary2 are treated as
     //                  array references: a zero (null) reference is handled, the lengths
     //                  are read at length_offset and the element base address is computed
     //                  here. When false, ary1/ary2 are used as character addresses
     //                  directly and limit must already hold the character count.
     //   ary1, ary2   - the two operands; may be modified by the emitted code
     //   limit        - character count (an input when !is_array_equ); overwritten
     //   result       - output, 1 for equal and 0 for not equal
     //   chr          - scratch register
     //   vec1, vec2   - XMM temporaries, used only when UseSSE42Intrinsics is set
9310 void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
9311                                         Register limit, Register result, Register chr,
9312                                         XMMRegister vec1, XMMRegister vec2) {
9313   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
9314 
9315   int length_offset  = arrayOopDesc::length_offset_in_bytes();
9316   int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
9317 
9318   // Check the input args
9319   cmpptr(ary1, ary2);
9320   jcc(Assembler::equal, TRUE_LABEL);  // identical operands are trivially equal
9321 
9322   if (is_array_equ) {
9323     // Need additional checks for arrays_equals.
         // The operands differ, so a zero operand on either side means not equal.
9324     testptr(ary1, ary1);
9325     jcc(Assembler::zero, FALSE_LABEL);
9326     testptr(ary2, ary2);
9327     jcc(Assembler::zero, FALSE_LABEL);
9328 
9329     // Check the lengths
9330     movl(limit, Address(ary1, length_offset));
9331     cmpl(limit, Address(ary2, length_offset));
9332     jcc(Assembler::notEqual, FALSE_LABEL);
9333   }
9334 
9335   // count == 0
9336   testl(limit, limit);
9337   jcc(Assembler::zero, TRUE_LABEL);
9338 
9339   if (is_array_equ) {
9340     // Load array address
9341     lea(ary1, Address(ary1, base_offset));
9342     lea(ary2, Address(ary2, base_offset));
9343   }
9344 
9345   shll(limit, 1);      // byte count != 0
9346   movl(result, limit); // copy
9347 
9348   if (UseSSE42Intrinsics) {
9349     // With SSE4.2, use double quad vector compare
9350     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
9351 
9352     // Compare 16-byte vectors
9353     andl(result, 0x0000000e);  //   tail count (in bytes)
9354     andl(limit, 0xfffffff0);   // vector count (in bytes)
9355     jccb(Assembler::zero, COMPARE_TAIL);
9356 
         // Point both operands at the end of the vector region and index it
         // with a negative byte offset that counts up to zero.
9357     lea(ary1, Address(ary1, limit, Address::times_1));
9358     lea(ary2, Address(ary2, limit, Address::times_1));
9359     negptr(limit);
9360 
9361     bind(COMPARE_WIDE_VECTORS);
9362     movdqu(vec1, Address(ary1, limit, Address::times_1));
9363     movdqu(vec2, Address(ary2, limit, Address::times_1));
9364     pxor(vec1, vec2);  // vec1 is all zero iff the two 16-byte chunks are identical
9365 
9366     ptest(vec1, vec1);
9367     jccb(Assembler::notZero, FALSE_LABEL);
9368     addptr(limit, 16);
9369     jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
9370 
9371     testl(result, result);
9372     jccb(Assembler::zero, TRUE_LABEL);  // no tail bytes left
9373 
         // Compare the final 16 bytes of the data; this chunk overlaps bytes
         // that were already compared by the loop above.
9374     movdqu(vec1, Address(ary1, result, Address::times_1, -16));
9375     movdqu(vec2, Address(ary2, result, Address::times_1, -16));
9376     pxor(vec1, vec2);
9377 
9378     ptest(vec1, vec1);
9379     jccb(Assembler::notZero, FALSE_LABEL);
9380     jmpb(TRUE_LABEL);
9381 
9382     bind(COMPARE_TAIL); // limit is zero
9383     movl(limit, result);  // limit = tail byte count (fewer than 16 bytes in total)
9384     // Fallthru to tail compare
9385   }
9386 
9387   // Compare 4-byte vectors
9388   andl(limit, 0xfffffffc); // vector count (in bytes)
9389   jccb(Assembler::zero, COMPARE_CHAR);
9390 
9391   lea(ary1, Address(ary1, limit, Address::times_1));
9392   lea(ary2, Address(ary2, limit, Address::times_1));
9393   negptr(limit);
9394 
9395   bind(COMPARE_VECTORS);
9396   movl(chr, Address(ary1, limit, Address::times_1));
9397   cmpl(chr, Address(ary2, limit, Address::times_1));
9398   jccb(Assembler::notEqual, FALSE_LABEL);
9399   addptr(limit, 4);
9400   jcc(Assembler::notZero, COMPARE_VECTORS);
9401 
9402   // Compare trailing char (final 2 bytes), if any
       // ary1/ary2 point just past the 4-byte chunks compared so far.
9403   bind(COMPARE_CHAR);
9404   testl(result, 0x2);   // tail  char
9405   jccb(Assembler::zero, TRUE_LABEL);
9406   load_unsigned_short(chr, Address(ary1, 0));
9407   load_unsigned_short(limit, Address(ary2, 0));  // limit is free to serve as a scratch register here
9408   cmpl(chr, limit);
9409   jccb(Assembler::notEqual, FALSE_LABEL);
9410 
9411   bind(TRUE_LABEL);
9412   movl(result, 1);   // return true
9413   jmpb(DONE);
9414 
9415   bind(FALSE_LABEL);
9416   xorl(result, result); // return false
9417 
9418   // That's it
9419   bind(DONE);
9420 }
9421 
9422 #ifdef PRODUCT
9423 #define BLOCK_COMMENT(str) /* nothing */
9424 #else
9425 #define BLOCK_COMMENT(str) block_comment(str)
9426 #endif
9427 
     // BIND binds a label and then hands the label name to BLOCK_COMMENT, which
     // expands to nothing in PRODUCT builds. The macro expands to two statements,
     // so it must only be used where a statement sequence is valid. Both macros
     // are undefined again right after generate_fill.
9428 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
     // Emits code that stores value into count consecutive elements of type t
     // (T_BYTE, T_SHORT or T_INT) starting at address to.
     //
     //   t       - element type; any other type hits ShouldNotReachHere()
     //   aligned - when true, the 4-byte alignment prologue for byte/short fills is not emitted
     //   to      - destination address; advanced as the fill proceeds
     //   value   - fill value; replicated in place into a 32-bit pattern for byte/short
     //   count   - number of elements; overwritten
     //   rtmp    - scratch register used while replicating value
     //   xtmp    - XMM scratch register, used only when UseSSE >= 2
9429 void MacroAssembler::generate_fill(BasicType t, bool aligned,
9430                                    Register to, Register value, Register count,
9431                                    Register rtmp, XMMRegister xtmp) {
9432   assert_different_registers(to, value, count, rtmp);
9433   Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
9434   Label L_fill_2_bytes, L_fill_4_bytes;
9435 
       // shift = log2(elements per 4 bytes), so (1 << shift) elements occupy 4 bytes.
9436   int shift = -1;
9437   switch (t) {
9438     case T_BYTE:
9439       shift = 2;
9440       break;
9441     case T_SHORT:
9442       shift = 1;
9443       break;
9444     case T_INT:
9445       shift = 0;
9446       break;
9447     default: ShouldNotReachHere();
9448   }
9449 
       // Replicate the element value across all 32 bits of value.
9450   if (t == T_BYTE) {
9451     andl(value, 0xff);
9452     movl(rtmp, value);
9453     shll(rtmp, 8);
9454     orl(value, rtmp);  // low 16 bits now hold the byte twice
9455   }
9456   if (t == T_SHORT) {
9457     andl(value, 0xffff);
9458   }
9459   if (t == T_BYTE || t == T_SHORT) {
9460     movl(rtmp, value);
9461     shll(rtmp, 16);
9462     orl(value, rtmp);  // copy the low 16 bits into the high 16 bits
9463   }
9464 
9465   cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
9466   jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
9467   if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
9468     // align destination address (to) at a 4-byte boundary
9469     if (t == T_BYTE) {
9470       // One byte misalignment happens only for byte arrays
9471       testptr(to, 1);
9472       jccb(Assembler::zero, L_skip_align1);
9473       movb(Address(to, 0), value);
9474       increment(to);
9475       decrement(count);
9476       BIND(L_skip_align1);
9477     }
9478     // Two bytes misalignment happens only for byte and short (char) arrays
9479     testptr(to, 2);
9480     jccb(Assembler::zero, L_skip_align2);
9481     movw(Address(to, 0), value);
9482     addptr(to, 2);
9483     subl(count, 1<<(shift-1));  // number of elements in the 2 bytes just stored
9484     BIND(L_skip_align2);
9485   }
9486   if (UseSSE < 2) {
9487     Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
9488     // Fill 32-byte chunks
9489     subl(count, 8 << shift);  // (8 << shift) elements == 32 bytes
9490     jcc(Assembler::less, L_check_fill_8_bytes);
9491     align(16);
9492 
9493     BIND(L_fill_32_bytes_loop);
9494 
         // Eight 4-byte stores cover one 32-byte chunk.
9495     for (int i = 0; i < 32; i += 4) {
9496       movl(Address(to, i), value);
9497     }
9498 
9499     addptr(to, 32);
9500     subl(count, 8 << shift);
9501     jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
9502     BIND(L_check_fill_8_bytes);
9503     addl(count, 8 << shift);  // undo the last subtraction: count = elements still to fill
9504     jccb(Assembler::zero, L_exit);
9505     jmpb(L_fill_8_bytes);
9506 
9507     //
9508     // length is too short, just fill qwords
9509     //
9510     BIND(L_fill_8_bytes_loop);
9511     movl(Address(to, 0), value);
9512     movl(Address(to, 4), value);
9513     addptr(to, 8);
9514     BIND(L_fill_8_bytes);
9515     subl(count, 1 << (shift + 1));  // (1 << (shift + 1)) elements == 8 bytes
9516     jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
9517     // fall through to fill 4 bytes
9518   } else {
9519     Label L_fill_32_bytes;
9520     if (!UseUnalignedLoadStores) {
9521       // align to 8 bytes, we know we are 4 byte aligned to start
9522       testptr(to, 4);
9523       jccb(Assembler::zero, L_fill_32_bytes);
9524       movl(Address(to, 0), value);
9525       addptr(to, 4);
9526       subl(count, 1<<shift);
9527     }
9528     BIND(L_fill_32_bytes);
9529     {
9530       assert( UseSSE >= 2, "supported cpu only" );
9531       Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
9532       // Fill 32-byte chunks
           // Broadcast the 32-bit pattern to all four dwords of xtmp.
9533       movdl(xtmp, value);
9534       pshufd(xtmp, xtmp, 0);
9535 
9536       subl(count, 8 << shift);
9537       jcc(Assembler::less, L_check_fill_8_bytes);
9538       align(16);
9539 
9540       BIND(L_fill_32_bytes_loop);
9541 
9542       if (UseUnalignedLoadStores) {
9543         movdqu(Address(to, 0), xtmp);
9544         movdqu(Address(to, 16), xtmp);
9545       } else {
             // Four 8-byte stores instead of two unaligned 16-byte stores.
9546         movq(Address(to, 0), xtmp);
9547         movq(Address(to, 8), xtmp);
9548         movq(Address(to, 16), xtmp);
9549         movq(Address(to, 24), xtmp);
9550       }
9551 
9552       addptr(to, 32);
9553       subl(count, 8 << shift);
9554       jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
9555       BIND(L_check_fill_8_bytes);
9556       addl(count, 8 << shift);  // undo the last subtraction: count = elements still to fill
9557       jccb(Assembler::zero, L_exit);
9558       jmpb(L_fill_8_bytes);
9559 
9560       //
9561       // length is too short, just fill qwords
9562       //
9563       BIND(L_fill_8_bytes_loop);
9564       movq(Address(to, 0), xtmp);
9565       addptr(to, 8);
9566       BIND(L_fill_8_bytes);
9567       subl(count, 1 << (shift + 1));
9568       jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
9569     }
9570   }
9571   // fill trailing 4 bytes
       // count may have gone negative in the 8-byte loop above, but only multiples of
       // (2 << shift) were subtracted, so the low bits tested below are still valid.
9572   BIND(L_fill_4_bytes);
9573   testl(count, 1<<shift);
9574   jccb(Assembler::zero, L_fill_2_bytes);
9575   movl(Address(to, 0), value);
9576   if (t == T_BYTE || t == T_SHORT) {
9577     addptr(to, 4);
9578     BIND(L_fill_2_bytes);
9579     // fill trailing 2 bytes
9580     testl(count, 1<<(shift-1));
9581     jccb(Assembler::zero, L_fill_byte);
9582     movw(Address(to, 0), value);
9583     if (t == T_BYTE) {
9584       addptr(to, 2);
9585       BIND(L_fill_byte);
9586       // fill trailing byte
9587       testl(count, 1);
9588       jccb(Assembler::zero, L_exit);
9589       movb(Address(to, 0), value);
9590     } else {
           // T_SHORT has no single trailing byte; the label is bound here only as a jump target.
9591       BIND(L_fill_byte);
9592     }
9593   } else {
         // T_INT has no 2-byte or 1-byte tail; the label is bound here only as a jump target.
9594     BIND(L_fill_2_bytes);
9595   }
9596   BIND(L_exit);
9597 }
9598 #undef BIND
9599 #undef BLOCK_COMMENT
9600 
9601 
9602 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
       // Maps a condition to its complementary condition
       // (zero <-> notZero, less <-> greaterEqual, below <-> aboveEqual, ...).
       // A value that matches none of the cases falls out of the switch into
       // ShouldNotReachHere().
9603   switch (cond) {
9604     // Note some conditions are synonyms for others
9605     case Assembler::zero:         return Assembler::notZero;
9606     case Assembler::notZero:      return Assembler::zero;
9607     case Assembler::less:         return Assembler::greaterEqual;
9608     case Assembler::lessEqual:    return Assembler::greater;
9609     case Assembler::greater:      return Assembler::lessEqual;
9610     case Assembler::greaterEqual: return Assembler::less;
9611     case Assembler::below:        return Assembler::aboveEqual;
9612     case Assembler::belowEqual:   return Assembler::above;
9613     case Assembler::above:        return Assembler::belowEqual;
9614     case Assembler::aboveEqual:   return Assembler::below;
9615     case Assembler::overflow:     return Assembler::noOverflow;
9616     case Assembler::noOverflow:   return Assembler::overflow;
9617     case Assembler::negative:     return Assembler::positive;
9618     case Assembler::positive:     return Assembler::negative;
9619     case Assembler::parity:       return Assembler::noParity;
9620     case Assembler::noParity:     return Assembler::parity;
9621   }
       // NOTE(review): the return value below is only reached after
       // ShouldNotReachHere(); presumably it exists to satisfy the compiler.
9622   ShouldNotReachHere(); return Assembler::overflow;
9623 }
9624 
9625 SkipIfEqual::SkipIfEqual(
9626     MacroAssembler* masm, const bool* flag_addr, bool value) {
       // Emits a compare of the byte at flag_addr against value, followed by a
       // jump to _label that is taken when they are equal. The destructor binds
       // _label, so code emitted while this object is alive is skipped at run
       // time when the flag equals value.
9627   _masm = masm;
9628   _masm->cmp8(ExternalAddress((address)flag_addr), value);
9629   _masm->jcc(Assembler::equal, _label);
9630 }
9631 
9632 SkipIfEqual::~SkipIfEqual() {
       // Bind the label targeted by the conditional jump emitted in the
       // constructor; this marks the end of the skippable code region.
9633   _masm->bind(_label);
9634 }