1 /*
   2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "assembler_x86.inline.hpp"
  27 #include "gc_interface/collectedHeap.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "memory/cardTableModRefBS.hpp"
  30 #include "memory/resourceArea.hpp"
  31 #include "prims/methodHandles.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/interfaceSupport.hpp"
  34 #include "runtime/objectMonitor.hpp"
  35 #include "runtime/os.hpp"
  36 #include "runtime/sharedRuntime.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #ifndef SERIALGC
  39 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  40 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  41 #include "gc_implementation/g1/heapRegion.hpp"
  42 #endif
  43 
  44 // Implementation of AddressLiteral
  45 
// Construct an AddressLiteral for 'target', choosing the relocation spec
// that matches the given relocation type.  For oop_type and none the
// _rspec member keeps its default (no relocation recorded here).
AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  _is_lval = false;   // plain address by default; not an lvalue reference
  _target = target;
  switch (rtype) {
  case relocInfo::oop_type:
    // Oops are a special case. Normally they would be their own section
    // but in cases like icBuffer they are literals in the code stream that
    // we don't have a section for. We use none so that we get a literal address
    // which is always patchable.
    break;
  case relocInfo::external_word_type:
    _rspec = external_word_Relocation::spec(target);
    break;
  case relocInfo::internal_word_type:
    _rspec = internal_word_Relocation::spec(target);
    break;
  case relocInfo::opt_virtual_call_type:
    _rspec = opt_virtual_call_Relocation::spec();
    break;
  case relocInfo::static_call_type:
    _rspec = static_call_Relocation::spec();
    break;
  case relocInfo::runtime_call_type:
    _rspec = runtime_call_Relocation::spec();
    break;
  case relocInfo::poll_type:
  case relocInfo::poll_return_type:
    // Poll relocations carry no extra payload; a simple spec suffices.
    _rspec = Relocation::spec_simple(rtype);
    break;
  case relocInfo::none:
    break;
  default:
    ShouldNotReachHere();
    break;
  }
}
  82 
  83 // Implementation of Address
  84 
  85 #ifdef _LP64
  86 
  87 Address Address::make_array(ArrayAddress adr) {
  88   // Not implementable on 64bit machines
  89   // Should have been handled higher up the call chain.
  90   ShouldNotReachHere();
  91   return Address();
  92 }
  93 
// exceedingly dangerous constructor (64-bit): builds a base/index-less
// Address whose displacement is 'disp', tagged with a relocation derived
// from 'rtype' so the embedded word at 'loc' can be patched later.
Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = disp;
  switch (rtype) {
    case relocInfo::external_word_type:
      _rspec = external_word_Relocation::spec(loc);
      break;
    case relocInfo::internal_word_type:
      _rspec = internal_word_Relocation::spec(loc);
      break;
    case relocInfo::runtime_call_type:
      // HMM
      _rspec = runtime_call_Relocation::spec();
      break;
    case relocInfo::poll_type:
    case relocInfo::poll_return_type:
      _rspec = Relocation::spec_simple(rtype);
      break;
    case relocInfo::none:
      break;
    default:
      ShouldNotReachHere();
  }
}
 121 #else // LP64
 122 
// 32-bit only: fold an AddressLiteral base and an index Address into one
// absolute [index._base + index._index*scale + literal] address, carrying
// the literal's relocation spec so the embedded address stays patchable.
Address Address::make_array(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(index._base, index._index, index._scale, (intptr_t) base.target());
  array._rspec = base._rspec;
  return array;
}
 131 
 132 // exceedingly dangerous constructor
 133 Address::Address(address loc, RelocationHolder spec) {
 134   _base  = noreg;
 135   _index = noreg;
 136   _scale = no_scale;
 137   _disp  = (intptr_t) loc;
 138   _rspec = spec;
 139 }
 140 
 141 #endif // _LP64
 142 
 143 
 144 
 145 // Convert the raw encoding form into the form expected by the constructor for
 146 // Address.  An index of 4 (rsp) corresponds to having no index, so convert
 147 // that to noreg for the Address constructor.
 148 Address Address::make_raw(int base, int index, int scale, int disp, bool disp_is_oop) {
 149   RelocationHolder rspec;
 150   if (disp_is_oop) {
 151     rspec = Relocation::spec_simple(relocInfo::oop_type);
 152   }
 153   bool valid_index = index != rsp->encoding();
 154   if (valid_index) {
 155     Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
 156     madr._rspec = rspec;
 157     return madr;
 158   } else {
 159     Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
 160     madr._rspec = rspec;
 161     return madr;
 162   }
 163 }
 164 
 165 // Implementation of Assembler
 166 
 167 int AbstractAssembler::code_fill_byte() {
 168   return (u_char)'\xF4'; // hlt
 169 }
 170 
 171 // make this go away someday
 172 void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
 173   if (rtype == relocInfo::none)
 174         emit_long(data);
 175   else  emit_data(data, Relocation::spec_simple(rtype), format);
 176 }
 177 
// Emit a 32-bit data word tagged with the given relocation spec.
// 'format' describes which operand flavor the word is (immediate,
// disp32, call32, ...); the relocation is recorded against the start of
// the enclosing instruction (inst_mark), not the word itself.
void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  if (rspec.type() !=  relocInfo::none) {
    #ifdef ASSERT
      check_relocation(rspec, format);
    #endif
    // Do not use AbstractAssembler::relocate, which is not intended for
    // embedded words.  Instead, relocate to the enclosing instruction.

    // hack. call32 is too wide for mask so use disp32
    if (format == call32_operand)
      code_section()->relocate(inst_mark(), rspec, disp32_operand);
    else
      code_section()->relocate(inst_mark(), rspec, format);
  }
  emit_long(data);
}
 196 
 197 static int encode(Register r) {
 198   int enc = r->encoding();
 199   if (enc >= 8) {
 200     enc -= 8;
 201   }
 202   return enc;
 203 }
 204 
 205 static int encode(XMMRegister r) {
 206   int enc = r->encoding();
 207   if (enc >= 8) {
 208     enc -= 8;
 209   }
 210   return enc;
 211 }
 212 
// Emit an 8-bit register-form arithmetic instruction with an imm8:
// opcode op1, ModRM byte (op2 with dst folded into the low bits), then
// the immediate byte.
void Assembler::emit_arith_b(int op1, int op2, Register dst, int imm8) {
  assert(dst->has_byte_register(), "must have byte register");
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert(isByte(imm8), "not a byte");
  assert((op1 & 0x01) == 0, "should be 8bit operation");  // bit 0 clear = byte-sized op
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_byte(imm8);
}
 222 
 223 
// Emit a 32-bit register-form arithmetic instruction with an immediate,
// using the sign-extended imm8 encoding when the value fits in a byte.
void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_byte(op2 | encode(dst));
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_byte(op2 | encode(dst));
    emit_long(imm32);
  }
}
 238 
// Force generation of a 4 byte immediate value even if it fits into 8bit.
// Unlike emit_arith above, this never uses the short sign-extended form,
// keeping the instruction length fixed regardless of the value.
void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_long(imm32);
}
 248 
// immediate-to-memory forms: opcode, then the ModRM/SIB bytes for 'adr'
// with the opcode extension carried in 'rm', then the immediate.  The
// 1 or 4 passed to emit_operand is the size of the trailing immediate,
// needed so any RIP-relative displacement is corrected for it.
void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_operand(rm, adr, 1);
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_operand(rm, adr, 4);
    emit_long(imm32);
  }
}
 263 
// 32-bit only: register-form arithmetic against an embedded oop.  The
// oop's address is emitted as data with an oop_type relocation so it
// remains patchable.
void Assembler::emit_arith(int op1, int op2, Register dst, jobject obj) {
  LP64_ONLY(ShouldNotReachHere());  // oops do not fit in a 32-bit immediate on 64-bit
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  InstructionMark im(this);
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_data((intptr_t)obj, relocInfo::oop_type, 0);
}
 274 
 275 
 276 void Assembler::emit_arith(int op1, int op2, Register dst, Register src) {
 277   assert(isByte(op1) && isByte(op2), "wrong opcode");
 278   emit_byte(op1);
 279   emit_byte(op2 | encode(dst) << 3 | encode(src));
 280 }
 281 
 282 
// Emit the ModRM byte (plus optional SIB byte and displacement) encoding
// the memory operand [base + index*scale + disp] with 'reg' in the ModRM
// reg field.  'rspec' carries any relocation for the displacement word;
// rip_relative_correction accounts for bytes (e.g. a trailing immediate)
// emitted after the disp32 when computing a RIP-relative displacement.
void Assembler::emit_operand(Register reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec,
                             int rip_relative_correction) {
  relocInfo::relocType rtype = (relocInfo::relocType) rspec.type();

  // Encode the registers as needed in the fields they are used in

  int regenc = encode(reg) << 3;
  int indexenc = index->is_valid() ? encode(index) << 3 : 0;
  int baseenc = base->is_valid() ? encode(base) : 0;

  if (base->is_valid()) {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [base + index*scale + disp]
      // rbp/r13 as base cannot use the disp-less form: their encoding is
      // reused to mean "disp32, no base" — hence the extra exclusions.
      if (disp == 0 && rtype == relocInfo::none  &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base + index*scale]
        // [00 reg 100][ss index base]
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x04 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + index*scale + imm8]
        // [01 reg 100][ss index base] imm8
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x44 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + index*scale + disp32]
        // [10 reg 100][ss index base] disp32
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x84 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    } else if (base == rsp LP64_ONLY(|| base == r12)) {
      // [rsp + disp]
      // rsp/r12 as base always require a SIB byte (their ModRM encoding
      // selects the SIB form).
      if (disp == 0 && rtype == relocInfo::none) {
        // [rsp]
        // [00 reg 100][00 100 100]
        emit_byte(0x04 | regenc);
        emit_byte(0x24);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [rsp + imm8]
        // [01 reg 100][00 100 100] disp8
        emit_byte(0x44 | regenc);
        emit_byte(0x24);
        emit_byte(disp & 0xFF);
      } else {
        // [rsp + imm32]
        // [10 reg 100][00 100 100] disp32
        emit_byte(0x84 | regenc);
        emit_byte(0x24);
        emit_data(disp, rspec, disp32_operand);
      }
    } else {
      // [base + disp]
      assert(base != rsp LP64_ONLY(&& base != r12), "illegal addressing mode");
      if (disp == 0 && rtype == relocInfo::none &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base]
        // [00 reg base]
        emit_byte(0x00 | regenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + disp8]
        // [01 reg base] disp8
        emit_byte(0x40 | regenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + disp32]
        // [10 reg base] disp32
        emit_byte(0x80 | regenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    }
  } else {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [index*scale + disp]
      // [00 reg 100][ss index 101] disp32
      assert(index != rsp, "illegal addressing mode");
      emit_byte(0x04 | regenc);
      emit_byte(scale << 6 | indexenc | 0x05);
      emit_data(disp, rspec, disp32_operand);
    } else if (rtype != relocInfo::none ) {
      // [disp] (64bit) RIP-RELATIVE (32bit) abs
      // [00 000 101] disp32

      emit_byte(0x05 | regenc);
      // Note that the RIP-rel. correction applies to the generated
      // disp field, but _not_ to the target address in the rspec.

      // disp was created by converting the target address minus the pc
      // at the start of the instruction. That needs more correction here.
      // intptr_t disp = target - next_ip;
      assert(inst_mark() != NULL, "must be inside InstructionMark");
      address next_ip = pc() + sizeof(int32_t) + rip_relative_correction;
      int64_t adjusted = disp;
      // Do rip-rel adjustment for 64bit
      LP64_ONLY(adjusted -=  (next_ip - inst_mark()));
      assert(is_simm32(adjusted),
             "must be 32bit offset (RIP relative address)");
      emit_data((int32_t) adjusted, rspec, disp32_operand);

    } else {
      // 32bit never did this, did everything as the rip-rel/disp code above
      // [disp] ABSOLUTE
      // [00 reg 100][00 100 101] disp32
      emit_byte(0x04 | regenc);
      emit_byte(0x25);
      emit_data(disp, rspec, disp32_operand);
    }
  }
}
 400 
// XMM forwarder: the address-encoding logic is identical for XMM
// destinations, so reuse the Register path (the cast only reinterprets
// the encoding; no register state is touched).
void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec) {
  emit_operand((Register)reg, base, index, scale, disp, rspec);
}
 406 
 407 // Secret local extension to Assembler::WhichOperand:
 408 #define end_pc_operand (_WhichOperand_limit)
 409 
// Partial x86/x64 instruction decoder used only to find operand fields
// inside instructions this assembler itself emitted.  It walks prefixes
// and the opcode map, then parses the ModRM/SIB/disp bytes emitted by
// emit_operand.  Only the opcodes the assembler can produce are handled;
// anything else hits ShouldNotReachHere().
address Assembler::locate_operand(address inst, WhichOperand which) {
  // Decode the given instruction, and return the address of
  // an embedded 32-bit operand word.

  // If "which" is disp32_operand, selects the displacement portion
  // of an effective address specifier.
  // If "which" is imm64_operand, selects the trailing immediate constant.
  // If "which" is call32_operand, selects the displacement of a call or jump.
  // Caller is responsible for ensuring that there is such an operand,
  // and that it is 32/64 bits wide.

  // If "which" is end_pc_operand, find the end of the instruction.

  address ip = inst;
  bool is_64bit = false;

  debug_only(bool has_disp32 = false);
  int tail_size = 0; // other random bytes (#32, #16, etc.) at end of insn

  again_after_prefix:
  switch (0xFF & *ip++) {

  // These convenience macros generate groups of "case" labels for the switch.
#define REP4(x) (x)+0: case (x)+1: case (x)+2: case (x)+3
#define REP8(x) (x)+0: case (x)+1: case (x)+2: case (x)+3: \
             case (x)+4: case (x)+5: case (x)+6: case (x)+7
#define REP16(x) REP8((x)+0): \
              case REP8((x)+8)

  case CS_segment:
  case SS_segment:
  case DS_segment:
  case ES_segment:
  case FS_segment:
  case GS_segment:
    // Seems dubious
    LP64_ONLY(assert(false, "shouldn't have that prefix"));
    assert(ip == inst+1, "only one prefix allowed");
    goto again_after_prefix;

  case 0x67:
  case REX:
  case REX_B:
  case REX_X:
  case REX_XB:
  case REX_R:
  case REX_RB:
  case REX_RX:
  case REX_RXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    goto again_after_prefix;

  case REX_W:
  case REX_WB:
  case REX_WX:
  case REX_WXB:
  case REX_WR:
  case REX_WRB:
  case REX_WRX:
  case REX_WRXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    is_64bit = true;   // REX.W set: 64-bit operand size
    goto again_after_prefix;

  case 0xFF: // pushq a; decl a; incl a; call a; jmp a
  case 0x88: // movb a, r
  case 0x89: // movl a, r
  case 0x8A: // movb r, a
  case 0x8B: // movl r, a
  case 0x8F: // popl a
    debug_only(has_disp32 = true);
    break;

  case 0x68: // pushq #32
    if (which == end_pc_operand) {
      return ip + 4;
    }
    assert(which == imm_operand && !is_64bit, "pushl has no disp32 or 64bit immediate");
    return ip;                  // not produced by emit_operand

  case 0x66: // movw ... (size prefix)
    again_after_size_prefix2:
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "64bit prefix found"));
      goto again_after_size_prefix2;
    case 0x8B: // movw r, a
    case 0x89: // movw a, r
      debug_only(has_disp32 = true);
      break;
    case 0xC7: // movw a, #16
      debug_only(has_disp32 = true);
      tail_size = 2;  // the imm16
      break;
    case 0x0F: // several SSE/SSE2 variants
      ip--;    // reparse the 0x0F
      goto again_after_prefix;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP8(0xB8): // movl/q r, #32/#64(oop?)
    if (which == end_pc_operand)  return ip + (is_64bit ? 8 : 4);
    // these asserts are somewhat nonsensical
#ifndef _LP64
    assert(which == imm_operand || which == disp32_operand,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#else
    assert((which == call32_operand || which == imm_operand) && is_64bit ||
           which == narrow_oop_operand && !is_64bit,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#endif // _LP64
    return ip;

  case 0x69: // imul r, a, #32
  case 0xC7: // movl a, #32(oop?)
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x0F: // movx..., etc.
    switch (0xFF & *ip++) {
    case 0x3A: // pcmpestri
      tail_size = 1;
      // fall through: also skip the third opcode byte below
    case 0x38: // ptest, pmovzxbw
      ip++; // skip opcode
      debug_only(has_disp32 = true); // has both kinds of operands!
      break;

    case 0x70: // pshufd r, r/a, #8
      debug_only(has_disp32 = true); // has both kinds of operands!
      // fall through: imm8 tail shared with 0x73
    case 0x73: // psrldq r, #8
      tail_size = 1;
      break;

    case 0x12: // movlps
    case 0x28: // movaps
    case 0x2E: // ucomiss
    case 0x2F: // comiss
    case 0x54: // andps
    case 0x55: // andnps
    case 0x56: // orps
    case 0x57: // xorps
    case 0x6E: // movd
    case 0x7E: // movd
    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
      debug_only(has_disp32 = true);
      break;

    case 0xAD: // shrd r, a, %cl
    case 0xAF: // imul r, a
    case 0xBE: // movsbl r, a (movsxb)
    case 0xBF: // movswl r, a (movsxw)
    case 0xB6: // movzbl r, a (movzxb)
    case 0xB7: // movzwl r, a (movzxw)
    case REP16(0x40): // cmovl cc, r, a
    case 0xB0: // cmpxchgb
    case 0xB1: // cmpxchg
    case 0xC1: // xaddl
    case 0xC7: // cmpxchg8
    case REP16(0x90): // setcc a
      debug_only(has_disp32 = true);
      // fall out of the switch to decode the address
      break;

    case 0xC4: // pinsrw r, a, #8
      debug_only(has_disp32 = true);
      // fall through: imm8 tail shared with 0xC5
    case 0xC5: // pextrw r, r, #8
      tail_size = 1;  // the imm8
      break;

    case 0xAC: // shrd r, a, #8
      debug_only(has_disp32 = true);
      tail_size = 1;  // the imm8
      break;

    case REP16(0x80): // jcc rdisp32
      if (which == end_pc_operand)  return ip + 4;
      assert(which == call32_operand, "jcc has no disp32 or imm");
      return ip;
    default:
      ShouldNotReachHere();
    }
    break;

  case 0x81: // addl a, #32; addl r, #32
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    // on 32bit in the case of cmpl, the imm might be an oop
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x83: // addl a, #8; addl r, #8
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1;
    break;

  case 0x9B:
    switch (0xFF & *ip++) {
    case 0xD9: // fnstcw a
      debug_only(has_disp32 = true);
      break;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP4(0x00): // addb a, r; addl a, r; addb r, a; addl r, a
  case REP4(0x10): // adc...
  case REP4(0x20): // and...
  case REP4(0x30): // xor...
  case REP4(0x08): // or...
  case REP4(0x18): // sbb...
  case REP4(0x28): // sub...
  case 0xF7: // mull a
  case 0x8D: // lea r, a
  case 0x87: // xchg r, a
  case REP4(0x38): // cmp...
  case 0x85: // test r, a
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
  case 0xC6: // movb a, #8
  case 0x80: // cmpb a, #8
  case 0x6B: // imul r, a, #8
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1; // the imm8
    break;

  case 0xC4: // VEX_3bytes
  case 0xC5: // VEX_2bytes
    assert((UseAVX > 0), "shouldn't have VEX prefix");
    assert(ip == inst+1, "no prefixes allowed");
    // C4 and C5 are also used as opcodes for PINSRW and PEXTRW instructions
    // but they have prefix 0x0F and processed when 0x0F processed above.
    //
    // In 32-bit mode the VEX first byte C4 and C5 alias onto LDS and LES
    // instructions (these instructions are not supported in 64-bit mode).
    // To distinguish them bits [7:6] are set in the VEX second byte since
    // ModRM byte can not be of the form 11xxxxxx in 32-bit mode. To set
    // those VEX bits REX and vvvv bits are inverted.
    //
    // Fortunately C2 doesn't generate these instructions so we don't need
    // to check for them in product version.

    // Check second byte
    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));

    // First byte
    if ((0xFF & *inst) == VEX_3bytes) {
      ip++; // third byte
      is_64bit = ((VEX_W & *ip) == VEX_W);
    }
    ip++; // opcode
    // To find the end of instruction (which == end_pc_operand).
    switch (0xFF & *ip) {
    case 0x61: // pcmpestri r, r/a, #8
    case 0x70: // pshufd r, r/a, #8
    case 0x73: // psrldq r, #8
      tail_size = 1;  // the imm8
      break;
    default:
      break;
    }
    ip++; // skip opcode
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
  case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
  case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
  case 0xDD: // fld_d a; fst_d a; fstp_d a
  case 0xDB: // fild_s a; fistp_s a; fld_x a; fstp_x a
  case 0xDF: // fild_d a; fistp_d a
  case 0xD8: // fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a
  case 0xDC: // fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a
  case 0xDE: // faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a
    debug_only(has_disp32 = true);
    break;

  case 0xE8: // call rdisp32
  case 0xE9: // jmp  rdisp32
    if (which == end_pc_operand)  return ip + 4;
    assert(which == call32_operand, "call has no disp32 or imm");
    return ip;

  case 0xF0:                    // Lock
    assert(os::is_MP(), "only on MP");
    goto again_after_prefix;

  case 0xF3:                    // For SSE
  case 0xF2:                    // For SSE2
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "found 64bit prefix"));
      ip++;
      // NOTE(review): deliberately falls into default to skip the opcode
      // byte after the REX prefix as well — confirm against emitters.
    default:
      ip++;
    }
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  default:
    ShouldNotReachHere();

#undef REP8
#undef REP16
  }

  assert(which != call32_operand, "instruction is not a call, jmp, or jcc");
#ifdef _LP64
  assert(which != imm_operand, "instruction is not a movq reg, imm64");
#else
  // assert(which != imm_operand || has_imm32, "instruction has no imm32 field");
  assert(which != imm_operand || has_disp32, "instruction has no imm32 field");
#endif // LP64
  assert(which != disp32_operand || has_disp32, "instruction has no disp32 field");

  // parse the output of emit_operand
  int op2 = 0xFF & *ip++;
  int base = op2 & 0x07;
  int op3 = -1;
  const int b100 = 4;
  const int b101 = 5;
  if (base == b100 && (op2 >> 6) != 3) {
    // mod != 11 with base 100 means a SIB byte follows
    op3 = 0xFF & *ip++;
    base = op3 & 0x07;   // refetch the base
  }
  // now ip points at the disp (if any)

  switch (op2 >> 6) {
  case 0:
    // [00 reg  100][ss index base]
    // [00 reg  100][00   100  esp]
    // [00 reg base]
    // [00 reg  100][ss index  101][disp32]
    // [00 reg  101]               [disp32]

    if (base == b101) {
      if (which == disp32_operand)
        return ip;              // caller wants the disp32
      ip += 4;                  // skip the disp32
    }
    break;

  case 1:
    // [01 reg  100][ss index base][disp8]
    // [01 reg  100][00   100  esp][disp8]
    // [01 reg base]               [disp8]
    ip += 1;                    // skip the disp8
    break;

  case 2:
    // [10 reg  100][ss index base][disp32]
    // [10 reg  100][00   100  esp][disp32]
    // [10 reg base]               [disp32]
    if (which == disp32_operand)
      return ip;                // caller wants the disp32
    ip += 4;                    // skip the disp32
    break;

  case 3:
    // [11 reg base]  (not a memory addressing mode)
    break;
  }

  if (which == end_pc_operand) {
    return ip + tail_size;
  }

#ifdef _LP64
  assert(which == narrow_oop_operand && !is_64bit, "instruction is not a movl adr, imm32");
#else
  assert(which == imm_operand, "instruction has only an imm field");
#endif // LP64
  return ip;
}
 820 
// Return the address just past the instruction at 'inst'.
address Assembler::locate_next_instruction(address inst) {
  // Secretly share code with locate_operand:
  return locate_operand(inst, end_pc_operand);
}
 825 
 826 
 827 #ifdef ASSERT
// Debug-only sanity check: verify that the operand bytes selected by the
// relocation's format within the current instruction end exactly at the
// current pc — i.e. the relocation points where the data was emitted.
void Assembler::check_relocation(RelocationHolder const& rspec, int format) {
  address inst = inst_mark();
  assert(inst != NULL && inst < pc(), "must point to beginning of instruction");
  address opnd;

  Relocation* r = rspec.reloc();
  if (r->type() == relocInfo::none) {
    return;  // nothing to check for a non-relocation
  } else if (r->is_call() || format == call32_operand) {
    // assert(format == imm32_operand, "cannot specify a nonzero format");
    opnd = locate_operand(inst, call32_operand);
  } else if (r->is_data()) {
    assert(format == imm_operand || format == disp32_operand
           LP64_ONLY(|| format == narrow_oop_operand), "format ok");
    opnd = locate_operand(inst, (WhichOperand)format);
  } else {
    assert(format == imm_operand, "cannot specify a format");
    return;
  }
  assert(opnd == pc(), "must put operand where relocs can find it");
}
 849 #endif // ASSERT
 850 
// Emit a memory operand restricted to the legacy 32-bit register file:
// neither the instruction register nor the address may need a REX prefix.
void Assembler::emit_operand32(Register reg, Address adr) {
  assert(reg->encoding() < 8, "no extended registers");
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 857 
// Emit the ModRM/SIB/disp bytes for 'adr' with 'reg' in the reg field;
// rip_relative_correction compensates for bytes (e.g. a trailing
// immediate) emitted after the disp32.
void Assembler::emit_operand(Register reg, Address adr,
                             int rip_relative_correction) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec,
               rip_relative_correction);
}
 864 
// XMM convenience overload: unpack the Address and reuse the XMM
// register/base/index emitter.
void Assembler::emit_operand(XMMRegister reg, Address adr) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 869 
// MMX operations
// Emit a memory operand for an MMX instruction.  Extended (REX) registers
// are asserted out of the address; the cast reuses the Register encoder.
void Assembler::emit_operand(MMXRegister reg, Address adr) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}
 875 
// work around gcc (3.2.1-7a) bug
// Identical to the overload above but with the arguments swapped; exists
// only so the old compiler resolves the call correctly.
void Assembler::emit_operand(Address adr, MMXRegister reg) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}
 881 
 882 
// Emits a two-byte x87 floating-point instruction: opcode b1 followed
// by b2 with the FPU stack offset i (ST(i)) folded into the low bits.
void Assembler::emit_farith(int b1, int b2, int i) {
  assert(isByte(b1) && isByte(b2), "wrong opcode");
  assert(0 <= i &&  i < 8, "illegal stack offset");  // x87 stack has 8 slots
  emit_byte(b1);
  emit_byte(b2 + i);
}
 889 
 890 
 891 // Now the Assembler instructions (identical for 32/64 bits)
 892 
// adc (add with carry), 32-bit forms.  For the immediate group-1
// opcode 0x81, the Register passed to emit_arith_operand() supplies
// the ModRM reg field, which is the opcode extension: rdx encodes /2
// (= ADC).
void Assembler::adcl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rdx, dst, imm32);  // 0x81 /2 id
}

// ADC r/m32, r32
void Assembler::adcl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x11);
  emit_operand(src, dst);
}

// ADC r32, imm32 (register-direct; 0xD0 = ModRM base for /2, mod=11)
void Assembler::adcl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD0, dst, imm32);
}

// ADC r32, r/m32
void Assembler::adcl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}

// ADC r32, r32
void Assembler::adcl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x13, 0xC0, dst, src);
}
 922 
// add, 32-bit forms.  rax passed to emit_arith_operand() encodes the
// /0 opcode extension (= ADD) for the group-1 immediate opcode 0x81.
void Assembler::addl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rax, dst, imm32);  // 0x81 /0 id
}

// ADD r/m32, r32
void Assembler::addl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}

// ADD r32, imm32 (register-direct)
void Assembler::addl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC0, dst, imm32);
}

// ADD r32, r/m32
void Assembler::addl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}

// ADD r32, r32
void Assembler::addl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
 952 
// Multi-byte NOPs (0F 1F /0 with various addressing forms) used to
// align code without executing many single-byte 0x90 NOPs.  Each
// variant below produces a NOP of a fixed total length.
void Assembler::addr_nop_4() {
  assert(UseAddressNop, "no CPU support");
  // 4 bytes: NOP DWORD PTR [EAX+0]
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x40); // emit_rm(cbuf, 0x1, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_5() {
  assert(UseAddressNop, "no CPU support");
  // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x44); // emit_rm(cbuf, 0x1, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_7() {
  assert(UseAddressNop, "no CPU support");
  // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x80); // emit_rm(cbuf, 0x2, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}

void Assembler::addr_nop_8() {
  assert(UseAddressNop, "no CPU support");
  // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x84); // emit_rm(cbuf, 0x2, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}
 990 
// Scalar SSE adds, opcode 0x58: addsd (F2 prefix, double) and addss
// (F3 prefix, single).  The three-XMM simd_prefix forms pass dst twice
// because dst also serves as the first (nds) source for VEX encoding.
void Assembler::addsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

void Assembler::addsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_operand(dst, src);
}

void Assembler::addss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

void Assembler::addss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_operand(dst, src);
}
1020 
// and, 32-bit forms.  rsp passed as the "register" operand encodes the
// /4 opcode extension (= AND) in the ModRM reg field of opcode 0x81.
void Assembler::andl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);  // 0x81 /4; 4 = size of trailing imm32 for RIP fixup
  emit_long(imm32);
}

// AND r32, imm32 (register-direct)
void Assembler::andl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE0, dst, imm32);
}

// AND r32, r/m32
void Assembler::andl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}

// AND r32, r32
void Assembler::andl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x23, 0xC0, dst, src);
}
1045 
// Packed bitwise AND, opcode 0x54: andpd (66 prefix, packed double)
// and andps (no prefix, packed single).
void Assembler::andpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x54);
  emit_operand(dst, src);
}

void Assembler::andpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x54);
  emit_byte(0xC0 | encode);
}

void Assembler::andps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x54);
  emit_operand(dst, src);
}

void Assembler::andps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x54);
  emit_byte(0xC0 | encode);
}
1075 
// BSF r32, r32: bit scan forward (index of lowest set bit).
void Assembler::bsfl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

// BSR r32, r32: bit scan reverse.  On CPUs with LZCNT, an F3-prefixed
// BSR decodes as LZCNT (different semantics), hence the guard.
void Assembler::bsrl(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// BSWAP r32: byte-swap (endianness reversal); register is folded into
// the opcode byte.
void Assembler::bswapl(Register reg) { // bswap
  int encode = prefix_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}
1096 
// CALL rel32 to a label.  For a bound (backward) label the final
// displacement is emitted directly; otherwise a zero placeholder is
// emitted and a patch site is recorded for when the label binds.
void Assembler::call(Label& L, relocInfo::relocType rtype) {
  // suspect disp32 is always good
  int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);

  if (L.is_bound()) {
    const int long_size = 5;  // 1 opcode byte + 4 displacement bytes
    int offs = (int)( target(L) - pc() );
    assert(offs <= 0, "assembler error");  // bound labels are behind us
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    emit_byte(0xE8);
    emit_data(offs - long_size, rtype, operand);  // disp is relative to next instruction
  } else {
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    L.add_patch_at(code(), locator());

    emit_byte(0xE8);
    emit_data(int(0), rtype, operand);  // placeholder, patched at bind time
  }
}
1118 
// CALL r32/r64: indirect call through a register (FF /2, mod=11).
void Assembler::call(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xD0 | encode);
}


// CALL m32/m64: indirect call through memory.  rdx encodes the /2
// opcode extension in the ModRM reg field.
void Assembler::call(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rdx, adr);
}

// CALL rel32 to an absolute address with relocation info.  The target
// must be reachable with a signed 32-bit displacement from the end of
// the instruction.
void Assembler::call_literal(address entry, RelocationHolder const& rspec) {
  assert(entry != NULL, "call most probably wrong");
  InstructionMark im(this);
  emit_byte(0xE8);
  intptr_t disp = entry - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (call2)");
  // Technically, should use call32_operand, but this format is
  // implied by the fact that we're emitting a call instruction.

  int operand = LP64_ONLY(disp32_operand) NOT_LP64(call32_operand);
  emit_data((int) disp, rspec, operand);
}
1145 
// CDQ: sign-extend EAX into EDX:EAX (used before idiv).
void Assembler::cdql() {
  emit_byte(0x99);
}

// CMOVcc r32, r32: conditional move; condition code is folded into the
// second opcode byte (0F 40+cc).
void Assembler::cmovl(Condition cc, Register dst, Register src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}


// CMOVcc r32, m32
void Assembler::cmovl(Condition cc, Register dst, Address src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
1166 
// cmp family.  rdi passed as the "register" operand encodes the /7
// opcode extension (= CMP) in the ModRM reg field of the group-1
// opcodes; the trailing int passed to emit_operand() is the size of
// the immediate that follows, for RIP-relative displacement fixup.
void Assembler::cmpb(Address dst, int imm8) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x80);            // CMP r/m8, imm8 (0x80 /7 ib)
  emit_operand(rdi, dst, 1);
  emit_byte(imm8);
}

void Assembler::cmpl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);            // CMP r/m32, imm32 (0x81 /7 id)
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}

// CMP r32, imm32 (register-direct)
void Assembler::cmpl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF8, dst, imm32);
}

// CMP r32, r32
void Assembler::cmpl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}


// CMP r32, r/m32
void Assembler::cmpl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}

// CMP r/m16, imm16 (0x66 operand-size override + 0x81 /7 iw)
void Assembler::cmpw(Address dst, int imm16) {
  InstructionMark im(this);
  assert(!dst.base_needs_rex() && !dst.index_needs_rex(), "no extended registers");
  emit_byte(0x66);
  emit_byte(0x81);
  emit_operand(rdi, dst, 2);
  emit_word(imm16);
}
1209 
// The 32-bit cmpxchg compares the value at adr with the contents of rax,
// and stores reg into adr if so; otherwise, the value at adr is loaded into rax,.
// The ZF is set if the compared values were equal, and cleared otherwise.
void Assembler::cmpxchgl(Register reg, Address adr) { // cmpxchg
  // Atomics bit 1 selects a diagnostic, non-atomic emulation of
  // cmpxchg built from cmp/mov/jcc (for testing only).
  if (Atomics & 2) {
     // caveat: no instructionmark, so this isn't relocatable.
     // Emit a synthetic, non-atomic, CAS equivalent.
     // Beware.  The synthetic form sets all ICCs, not just ZF.
     // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r)
     cmpl(rax, adr);
     movl(rax, adr);
     if (reg != rax) {
        Label L ;
        jcc(Assembler::notEqual, L);
        movl(adr, reg);
        bind(L);
     }
  } else {
     // Real hardware CMPXCHG (0F B1 /r).  Note: no LOCK prefix here;
     // callers emit lock() separately when atomicity is required.
     InstructionMark im(this);
     prefix(adr, reg);
     emit_byte(0x0F);
     emit_byte(0xB1);
     emit_operand(reg, adr);
  }
}
1235 
// Ordered scalar compares, opcode 0x2F: comisd (66 prefix, double)
// and comiss (no prefix, single); they set EFLAGS.
void Assembler::comisd(XMMRegister dst, Address src) {
  // NOTE: dbx seems to decode this as comiss even though the
  // 0x66 is there. Strangely ucomisd comes out correct
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x2F);
  emit_operand(dst, src);
}

void Assembler::comisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x2F);
  emit_byte(0xC0 | encode);
}

void Assembler::comiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2F);
  emit_operand(dst, src);
}

void Assembler::comiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2F);
  emit_byte(0xC0 | encode);
}
1267 
// CVTDQ2PD: packed int32 -> packed double (F3 0F E6).
void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0xE6);
  emit_byte(0xC0 | encode);
}

// CVTDQ2PS: packed int32 -> packed single (0F 5B).
void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x5B);
  emit_byte(0xC0 | encode);
}

// CVTSD2SS: scalar double -> scalar single (F2 0F 5A).
void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5A);
  emit_operand(dst, src);
}
1296 
// CVTSI2SD: int32 -> scalar double (F2 0F 2A).
void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// CVTSI2SS: int32 -> scalar single (F3 0F 2A).
void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// CVTSS2SD: scalar single -> scalar double (F3 0F 5A).
void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtss2sd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5A);
  emit_operand(dst, src);
}
1341 
1342 
// CVTTSD2SI: scalar double -> int32 with truncation (F2 0F 2C).
void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

// CVTTSS2SI: scalar single -> int32 with truncation (F3 0F 2C).
void Assembler::cvttss2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
1356 
// DEC m32 (FF /1; rcx encodes the /1 opcode extension).
void Assembler::decl(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrement() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}
1364 
// Scalar SSE divides, opcode 0x5E: divsd (F2, double) and divss (F3,
// single).
void Assembler::divsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

void Assembler::divsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

void Assembler::divss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

void Assembler::divss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}
1394 
// EMMS: empty MMX state, restoring the x87 tag word (0F 77).
void Assembler::emms() {
  NOT_LP64(assert(VM_Version::supports_mmx(), ""));
  emit_byte(0x0F);
  emit_byte(0x77);
}

// HLT: halt; used to pad/poison code that must never execute.
void Assembler::hlt() {
  emit_byte(0xF4);
}

// IDIV r32: signed divide of EDX:EAX by src (F7 /7).
void Assembler::idivl(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}

// DIV r32: unsigned divide of EDX:EAX by src (F7 /6).
void Assembler::divl(Register src) { // Unsigned
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF0 | encode);
}
1416 
// IMUL r32, r32 (0F AF /r): two-operand signed multiply.
void Assembler::imull(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}


// IMUL r32, r32, imm: three-operand signed multiply; uses the short
// imm8 form (0x6B) when the constant fits in a signed byte, otherwise
// the imm32 form (0x69).
void Assembler::imull(Register dst, Register src, int value) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}

// INC m32 (FF /0; rax encodes the /0 opcode extension).
void Assembler::incl(Address dst) {
  // Don't use it directly. Use MacroAssembler::increment() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}
1445 
// Jcc: conditional jump to a label.  Bound labels get a short (8-bit)
// form when the displacement fits and maybe_short allows it, else the
// long (32-bit) form; unbound labels always get the long form with a
// zero placeholder patched at bind time.
void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
  InstructionMark im(this);
  assert((0 <= cc) && (cc < 16), "illegal cc");
  if (L.is_bound()) {
    address dst = target(L);
    assert(dst != NULL, "jcc most probably wrong");

    const int short_size = 2;  // 70+cc, disp8
    const int long_size = 6;   // 0F 80+cc, disp32
    intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
    // Displacements are relative to the end of the instruction, hence
    // the subtraction of the instruction size below.
    if (maybe_short && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      assert(is_simm32(offs - long_size),
             "must be 32bit offset (call4)");
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
    // Note: could eliminate cond. jumps to this jump if condition
    //       is the same however, seems to be rather unlikely case.
    // Note: use jccb() if label to be bound is very close to get
    //       an 8-bit displacement
    L.add_patch_at(code(), locator());
    emit_byte(0x0F);
    emit_byte(0x80 | cc);
    emit_long(0);
  }
}
1479 
1480 void Assembler::jccb(Condition cc, Label& L) {
1481   if (L.is_bound()) {
1482     const int short_size = 2;
1483     address entry = target(L);
1484 #ifdef ASSERT
1485     intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
1486     intptr_t delta = short_branch_delta();
1487     if (delta != 0) {
1488       dist += (dist < 0 ? (-delta) :delta);
1489     }
1490     assert(is8bit(dist), "Dispacement too large for a short jmp");
1491 #endif
1492     intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
1493     // 0111 tttn #8-bit disp
1494     emit_byte(0x70 | cc);
1495     emit_byte((offs - short_size) & 0xFF);
1496   } else {
1497     InstructionMark im(this);
1498     L.add_patch_at(code(), locator());
1499     emit_byte(0x70 | cc);
1500     emit_byte(0);
1501   }
1502 }
1503 
// JMP m32/m64: indirect jump through memory (FF /4; rsp encodes the
// /4 opcode extension).
void Assembler::jmp(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rsp, adr);
}
1510 
// JMP to a label: short form (EB disp8) for a close bound target when
// maybe_short allows it, otherwise near form (E9 disp32); unbound
// labels get a patchable 32-bit placeholder.
void Assembler::jmp(Label& L, bool maybe_short) {
  if (L.is_bound()) {
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
    InstructionMark im(this);
    const int short_size = 2;  // EB, disp8
    const int long_size = 5;   // E9, disp32
    intptr_t offs = entry - _code_pos;
    // Displacements are relative to the end of the instruction.
    if (maybe_short && is8bit(offs - short_size)) {
      emit_byte(0xEB);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      emit_byte(0xE9);
      emit_long(offs - long_size);
    }
  } else {
    // By default, forward jumps are always 32-bit displacements, since
    // we can't yet know where the label will be bound.  If you're sure that
    // the forward jump will not run beyond 256 bytes, use jmpb to
    // force an 8-bit displacement.
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xE9);
    emit_long(0);
  }
}
1537 
// JMP r32/r64: indirect jump through a register (FF /4, mod=11).
void Assembler::jmp(Register entry) {
  int encode = prefix_and_encode(entry->encoding());
  emit_byte(0xFF);
  emit_byte(0xE0 | encode);
}

// JMP rel32 to an absolute address with relocation info; the target
// must be within a signed 32-bit displacement.
void Assembler::jmp_literal(address dest, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xE9);
  assert(dest != NULL, "must have a target");
  intptr_t disp = dest - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (jmp)");
  emit_data(disp, rspec.reloc(), call32_operand);
}
1552 
1553 void Assembler::jmpb(Label& L) {
1554   if (L.is_bound()) {
1555     const int short_size = 2;
1556     address entry = target(L);
1557     assert(entry != NULL, "jmp most probably wrong");
1558 #ifdef ASSERT
1559     intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
1560     intptr_t delta = short_branch_delta();
1561     if (delta != 0) {
1562       dist += (dist < 0 ? (-delta) :delta);
1563     }
1564     assert(is8bit(dist), "Dispacement too large for a short jmp");
1565 #endif
1566     intptr_t offs = entry - _code_pos;
1567     emit_byte(0xEB);
1568     emit_byte((offs - short_size) & 0xFF);
1569   } else {
1570     InstructionMark im(this);
1571     L.add_patch_at(code(), locator());
1572     emit_byte(0xEB);
1573     emit_byte(0);
1574   }
1575 }
1576 
// LDMXCSR m32: load the SSE control/status register (0F AE /2; the
// literal register 2 supplies the opcode extension).
void Assembler::ldmxcsr( Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(2), src);
}

// LEA r32, m: load effective address.  On 64-bit the 0x67 address-size
// override forces 32-bit address computation for this 32-bit lea.
void Assembler::leal(Register dst, Address src) {
  InstructionMark im(this);
#ifdef _LP64
  emit_byte(0x67); // addr32
  prefix(src, dst);
#endif // _LP64
  emit_byte(0x8D);
  emit_operand(dst, src);
}

// LOCK prefix (F0).  Atomics bit 0 substitutes a NOP (0x90) for
// diagnostic runs without bus locking.
void Assembler::lock() {
  if (Atomics & 1) {
     // Emit either nothing, a NOP, or a NOP: prefix
     emit_byte(0x90) ;
  } else {
     emit_byte(0xF0);
  }
}
1604 
// LZCNT r32, r32 (F3 0F BD): count leading zeros.  Without LZCNT
// support this byte pattern would execute as BSR, hence the guard.
void Assembler::lzcntl(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);  // mandatory prefix, emitted before any REX from prefix_and_encode
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// Emit mfence instruction (0F AE F0): full memory fence.
void Assembler::mfence() {
  NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
  emit_byte( 0x0F );
  emit_byte( 0xAE );
  emit_byte( 0xF0 );
}

// Pointer-sized register move: movq on 64-bit, movl on 32-bit.
void Assembler::mov(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
1625 
// MOVAPD xmm, xmm (66 0F 28): aligned packed-double move.
void Assembler::movapd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x28);
  emit_byte(0xC0 | encode);
}

// MOVAPS xmm, xmm (0F 28): aligned packed-single move.
void Assembler::movaps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x28);
  emit_byte(0xC0 | encode);
}

// MOVLHPS xmm, xmm (0F 16): copy low quadword of src to high quadword
// of dst.  src is passed twice so it also fills the VEX nds slot.
void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
  emit_byte(0x16);
  emit_byte(0xC0 | encode);
}
1646 
// Byte moves.  The trailing 'true' passed to prefix() marks a byte
// instruction, which on 64-bit may require a REX prefix to reach the
// low byte of rsp/rbp/rsi/rdi.
void Assembler::movb(Register dst, Address src) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  InstructionMark im(this);
  prefix(src, dst, true);
  emit_byte(0x8A);   // MOV r8, r/m8
  emit_operand(dst, src);
}


// MOV r/m8, imm8 (C6 /0 ib)
void Assembler::movb(Address dst, int imm8) {
  InstructionMark im(this);
   prefix(dst);
  emit_byte(0xC6);
  emit_operand(rax, dst, 1);
  emit_byte(imm8);
}


// MOV r/m8, r8 (88 /r)
void Assembler::movb(Address dst, Register src) {
  assert(src->has_byte_register(), "must have byte register");
  InstructionMark im(this);
  prefix(dst, src, true);
  emit_byte(0x88);
  emit_operand(src, dst);
}
1672 
// MOVD: 32-bit moves between XMM and GPR/memory (66 0F 6E load into
// xmm, 66 0F 7E store from xmm).
void Assembler::movdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_operand(dst, src);
}

void Assembler::movdl(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_operand(src, dst);
}
1703 
// MOVDQA xmm, xmm (66 0F 6F): aligned 128-bit move.
void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}

// MOVDQU: unaligned 128-bit moves (F3 0F 6F load, F3 0F 7F store).
void Assembler::movdqu(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}

void Assembler::movdqu(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7F);
  emit_operand(src, dst);
}
1733 
// Move Unaligned 256bit Vector: AVX VMOVDQU with VEX.256 (the
// vector256 flag selects the 256-bit form of the VEX prefix).
void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(UseAVX, "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}

void Assembler::vmovdqu(XMMRegister dst, Address src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::vmovdqu(Address dst, XMMRegister src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  // swap src<->dst for encoding
  assert(src != xnoreg, "sanity");
  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
  emit_byte(0x7F);  // store form
  emit_operand(src, dst);
}
1762 
1763 // Uses zero extension on 64bit
1764 
// 32-bit moves (on 64-bit these zero-extend into the full register).
// MOV r32, imm32: register number folded into the opcode (B8+r id).
void Assembler::movl(Register dst, int32_t imm32) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_long(imm32);
}

// MOV r32, r32 (8B /r)
void Assembler::movl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}

// MOV r32, r/m32
void Assembler::movl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

// MOV r/m32, imm32 (C7 /0 id; trailing 4 = imm size for RIP fixup)
void Assembler::movl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_long(imm32);
}

// MOV r/m32, r32 (89 /r)
void Assembler::movl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
1798 
// New cpus require to use movsd and movss to avoid partial register stall
// when loading from memory. But for old Opteron use movlpd instead of movsd.
// The selection is done in MacroAssembler::movdbl() and movflt().
// MOVLPD xmm, m64 (66 0F 12): load into the low quadword only.
void Assembler::movlpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x12);
  emit_operand(dst, src);
}

// MOVQ mm, m64 (0F 6F): MMX 64-bit load.
void Assembler::movq( MMXRegister dst, Address src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

// MOVQ m64, mm (0F 7F): MMX 64-bit store.
void Assembler::movq( Address dst, MMXRegister src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x7F);
  // workaround gcc (3.2.1-7a) bug
  // In that version of gcc with only an emit_operand(MMX, Address)
  // gcc will tail jump and try and reverse the parameters completely
  // obliterating dst in the process. By having a version available
  // that doesn't need to swap the args at the tail jump the bug is
  // avoided.
  emit_operand(dst, src);
}
1829 
// MOVQ xmm, m64 (F3 0F 7E): 64-bit load, upper half zeroed.
void Assembler::movq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7E);
  emit_operand(dst, src);
}

// MOVQ m64, xmm (66 0F D6): 64-bit store.
void Assembler::movq(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0xD6);
  emit_operand(src, dst);
}
1845 
1846 void Assembler::movsbl(Register dst, Address src) { // movsxb
1847   InstructionMark im(this);
1848   prefix(src, dst);
1849   emit_byte(0x0F);
1850   emit_byte(0xBE);
1851   emit_operand(dst, src);
1852 }
1853 
1854 void Assembler::movsbl(Register dst, Register src) { // movsxb
1855   NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
1856   int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
1857   emit_byte(0x0F);
1858   emit_byte(0xBE);
1859   emit_byte(0xC0 | encode);
1860 }
1861 
// MOVSD xmm, xmm -- copy a scalar double between XMM registers.
void Assembler::movsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x10);          // F2-prefixed 0F 10 = MOVSD load form
  emit_byte(0xC0 | encode);
}

// MOVSD xmm, m64 -- load a scalar double from memory.
void Assembler::movsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x10);
  emit_operand(dst, src);
}

// MOVSD m64, xmm -- store a scalar double to memory.
void Assembler::movsd(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x11);          // store form of MOVSD
  emit_operand(src, dst);
}

// MOVSS xmm, xmm -- copy a scalar float between XMM registers.
void Assembler::movss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x10);          // F3-prefixed 0F 10 = MOVSS load form
  emit_byte(0xC0 | encode);
}

// MOVSS xmm, m32 -- load a scalar float from memory.
void Assembler::movss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x10);
  emit_operand(dst, src);
}

// MOVSS m32, xmm -- store a scalar float to memory.
void Assembler::movss(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x11);          // store form of MOVSS
  emit_operand(src, dst);
}

// MOVSX r32, m16 -- load a word from memory with sign extension.
void Assembler::movswl(Register dst, Address src) { // movsxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);          // MOVSX r32, r/m16
  emit_operand(dst, src);
}

// MOVSX r32, r16 -- sign-extend a 16-bit register into dst.
void Assembler::movswl(Register dst, Register src) { // movsxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}

// MOV m16, imm16 -- store a 16-bit immediate to memory.
void Assembler::movw(Address dst, int imm16) {
  InstructionMark im(this);

  emit_byte(0x66); // switch to 16-bit mode
  prefix(dst);
  emit_byte(0xC7);          // MOV r/m16, imm16 (/0)
  emit_operand(rax, dst, 2);
  emit_word(imm16);
}

// MOV r16, m16 -- load 16 bits from memory (upper bits of dst untouched).
void Assembler::movw(Register dst, Address src) {
  InstructionMark im(this);
  emit_byte(0x66);          // operand-size override: 16-bit
  prefix(src, dst);
  emit_byte(0x8B);          // MOV r16, r/m16
  emit_operand(dst, src);
}

// MOV m16, r16 -- store the low 16 bits of src to memory.
void Assembler::movw(Address dst, Register src) {
  InstructionMark im(this);
  emit_byte(0x66);          // operand-size override: 16-bit
  prefix(dst, src);
  emit_byte(0x89);          // MOV r/m16, r16
  emit_operand(src, dst);
}

// MOVZX r32, m8 -- load a byte from memory with zero extension.
void Assembler::movzbl(Register dst, Address src) { // movzxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);          // MOVZX r32, r/m8
  emit_operand(dst, src);
}
1956 
1957 void Assembler::movzbl(Register dst, Register src) { // movzxb
1958   NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
1959   int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
1960   emit_byte(0x0F);
1961   emit_byte(0xB6);
1962   emit_byte(0xC0 | encode);
1963 }
1964 
// MOVZX r32, m16 -- load a word from memory with zero extension.
void Assembler::movzwl(Register dst, Address src) { // movzxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);        // MOVZX r32, r/m16
  emit_operand(dst, src);
}

// MOVZX r32, r16 -- zero-extend a 16-bit register into dst.
void Assembler::movzwl(Register dst, Register src) { // movzxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}

// MUL m32 -- unsigned multiply EAX by the memory operand (EDX:EAX result).
void Assembler::mull(Address src) {
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xF7);        // group-3 opcode
  emit_operand(rsp, src); // rsp = /4 extension selects MUL
}
1986 
1987 void Assembler::mull(Register src) {
1988   int encode = prefix_and_encode(src->encoding());
1989   emit_byte(0xF7);
1990   emit_byte(0xE0 | encode);
1991 }
1992 
// MULSD xmm, m64 -- scalar double multiply by a memory operand.
void Assembler::mulsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x59);        // F2-prefixed 0F 59 = MULSD
  emit_operand(dst, src);
}

// MULSD xmm, xmm -- scalar double multiply.
void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}

// MULSS xmm, m32 -- scalar float multiply by a memory operand.
void Assembler::mulss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x59);        // F3-prefixed 0F 59 = MULSS
  emit_operand(dst, src);
}

// MULSS xmm, xmm -- scalar float multiply.
void Assembler::mulss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}
2022 
2023 void Assembler::negl(Register dst) {
2024   int encode = prefix_and_encode(dst->encoding());
2025   emit_byte(0xF7);
2026   emit_byte(0xD8 | encode);
2027 }
2028 
// Emit exactly i bytes of padding (i > 0).  When UseAddressNop is enabled,
// uses multi-byte address NOPs ("0x0F 0x1F [address]") with CPU-specific
// mixing rules for Intel vs AMD; otherwise falls back to 0x66-prefixed
// plain NOPs.  The switch statements below rely on deliberate case
// fallthrough to accumulate 0x66 size prefixes before the final opcode.
void Assembler::nop(int i) {
#ifdef ASSERT
  assert(i > 0, " ");
  // The fancy nops aren't currently recognized by debuggers making it a
  // pain to disassemble code while debugging. If asserts are on clearly
  // speed is not an issue so simply use the single byte traditional nop
  // to do alignment.

  for (; i > 0 ; i--) emit_byte(0x90);
  return;

#endif // ASSERT

  if (UseAddressNop && VM_Version::is_intel()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for Intel
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is Intel specific - don't use consecutive address nops

    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90

    while(i >= 15) {
      // For Intel don't generate consecutive addess nops (mix with regular nops)
      i -= 15;
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      addr_nop_8();
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x90);   // nop
    }
    switch (i) {
      case 14:
        emit_byte(0x66); // size prefix
      case 13:
        emit_byte(0x66); // size prefix
      case 12:
        addr_nop_8();
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x90); // nop
        break;
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }
  if (UseAddressNop && VM_Version::is_amd()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for AMD.
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is AMD specific - use consecutive address nops

    // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //     Size prefixes (0x66) are added for larger sizes

    while(i >= 22) {
      i -= 11;
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      addr_nop_8();
    }
    // Generate first nop for size between 21-12
    switch (i) {
      case 21:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 20:
      case 19:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 18:
      case 17:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 16:
      case 15:
        i -= 8;
        addr_nop_8();
        break;
      case 14:
      case 13:
        i -= 7;
        addr_nop_7();
        break;
      case 12:
        i -= 6;
        emit_byte(0x66); // size prefix
        addr_nop_5();
        break;
      default:
        assert(i < 12, " ");
    }

    // Generate second nop for size between 11-1
    switch (i) {
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }

  // Using nops with size prefixes "0x66 0x90".
  // From AMD Optimization Guide:
  //  1: 0x90
  //  2: 0x66 0x90
  //  3: 0x66 0x66 0x90
  //  4: 0x66 0x66 0x66 0x90
  //  5: 0x66 0x66 0x90 0x66 0x90
  //  6: 0x66 0x66 0x90 0x66 0x66 0x90
  //  7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
  //  8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
  //  9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  //
  while(i > 12) {
    i -= 4;
    emit_byte(0x66); // size prefix
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90); // nop
  }
  // 1 - 12 nops
  if(i > 8) {
    if(i > 9) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  // 1 - 8 nops
  if(i > 4) {
    if(i > 6) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  switch (i) {
    case 4:
      emit_byte(0x66);
    case 3:
      emit_byte(0x66);
    case 2:
      emit_byte(0x66);
    case 1:
      emit_byte(0x90);
      break;
    default:
      assert(i == 0, " ");
  }
}
2276 
2277 void Assembler::notl(Register dst) {
2278   int encode = prefix_and_encode(dst->encoding());
2279   emit_byte(0xF7);
2280   emit_byte(0xD0 | encode );
2281 }
2282 
// OR m32, imm32 -- or an immediate into a memory operand.
void Assembler::orl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rcx, dst, imm32); // rcx = /1 extension selects OR
}

// OR r32, imm32.
void Assembler::orl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC8, dst, imm32);
}

// OR r32, m32.
void Assembler::orl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0B);        // OR r32, r/m32
  emit_operand(dst, src);
}

// OR r32, r32.
void Assembler::orl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}

// PACKUSWB xmm, m128 -- pack words into bytes with unsigned saturation.
void Assembler::packuswb(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x67);        // PACKUSWB
  emit_operand(dst, src);
}

// PACKUSWB xmm, xmm.
void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x67);
  emit_byte(0xC0 | encode);
}

// PCMPESTRI xmm, m128, imm8 -- packed compare explicit-length strings,
// result index in ECX (SSE4.2).
void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);        // PCMPESTRI
  emit_operand(dst, src);
  emit_byte(imm8);        // comparison-mode immediate
}

// PCMPESTRI xmm, xmm, imm8.
void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_byte(0xC0 | encode);
  emit_byte(imm8);
}

// PMOVZXBW xmm, m64 -- zero-extend 8 packed bytes to words (SSE4.1).
void Assembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);        // PMOVZXBW
  emit_operand(dst, src);
}

// PMOVZXBW xmm, xmm.
void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_byte(0xC0 | encode);
}
2353 
2354 // generic
2355 void Assembler::pop(Register dst) {
2356   int encode = prefix_and_encode(dst->encoding());
2357   emit_byte(0x58 | encode);
2358 }
2359 
// POPCNT r32, m32 -- population count of a memory operand.
void Assembler::popcntl(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);        // mandatory F3 prefix precedes any REX prefix
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);        // POPCNT
  emit_operand(dst, src);
}

// POPCNT r32, r32.
void Assembler::popcntl(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);        // mandatory F3 prefix precedes any REX prefix
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}

// POPF -- pop flags register from the stack.
void Assembler::popf() {
  emit_byte(0x9D);
}

#ifndef _LP64 // no 32bit push/pop on amd64
// POP m32 -- pop the top of stack into a memory operand.
void Assembler::popl(Address dst) {
  // NOTE: this will adjust stack by 8byte on 64bits
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x8F);        // POP r/m (/0)
  emit_operand(rax, dst);
}
#endif

// Common prefix bytes shared by all prefetch encodings below.
void Assembler::prefetch_prefix(Address src) {
  prefix(src);
  emit_byte(0x0F);
}

// PREFETCHNTA m8 -- prefetch with a non-temporal hint.
void Assembler::prefetchnta(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);        // PREFETCHh group
  emit_operand(rax, src); // 0, src
}

// PREFETCH m8 (3DNow!) -- prefetch for read.
void Assembler::prefetchr(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);        // 3DNow! PREFETCH group
  emit_operand(rax, src); // 0, src
}

// PREFETCHT0 m8 -- prefetch into all cache levels.
void Assembler::prefetcht0(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rcx, src); // 1, src
}

// PREFETCHT1 m8 -- prefetch into L2 and higher.
void Assembler::prefetcht1(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rdx, src); // 2, src
}

// PREFETCHT2 m8 -- prefetch into L3 and higher.
void Assembler::prefetcht2(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rbx, src); // 3, src
}

// PREFETCHW m8 (3DNow!) -- prefetch with intent to write.
void Assembler::prefetchw(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rcx, src); // 1, src
}

// Emit a raw prefix byte (e.g. REX, segment or size override).
void Assembler::prefix(Prefix p) {
  a_byte(p);
}
2449 
// POR xmm, xmm -- bitwise or of the full 128-bit registers.
void Assembler::por(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEB);        // POR
  emit_byte(0xC0 | encode);
}

// POR xmm, m128 -- bitwise or with a memory operand.
void Assembler::por(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEB);
  emit_operand(dst, src);
}
2465 
2466 void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
2467   assert(isByte(mode), "invalid value");
2468   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2469   int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
2470   emit_byte(0x70);
2471   emit_byte(0xC0 | encode);
2472   emit_byte(mode & 0xFF);
2473 
2474 }
2475 
// PSHUFD xmm, m128, imm8 -- shuffle packed doublewords from memory.
void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x70);        // PSHUFD
  emit_operand(dst, src);
  emit_byte(mode & 0xFF); // shuffle-control immediate
}

// PSHUFLW xmm, xmm, imm8 -- shuffle the four low words.
void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);        // F2-prefixed 0F 70 = PSHUFLW
  emit_byte(0xC0 | encode);
  emit_byte(mode & 0xFF);
}

// PSHUFLW xmm, m128, imm8 -- shuffle the four low words from memory.
void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);
}

void Assembler::psrlq(XMMRegister dst, int shift) {
  // Shift 64 bit value logically right by specified number of bits.
  // HMM Table D-1 says sse2 or mmx.
  // Do not confuse it with psrldq SSE2 instruction which
  // shifts 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66); // xmm2 = /2 selects PSRLQ
  emit_byte(0x73);        // group-14 opcode
  emit_byte(0xC0 | encode);
  emit_byte(shift);       // bit count
}

void Assembler::psrldq(XMMRegister dst, int shift) {
  // Shift 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66); // xmm3 = /3 selects PSRLDQ
  emit_byte(0x73);        // group-14 opcode
  emit_byte(0xC0 | encode);
  emit_byte(shift);       // byte count
}

// PTEST xmm, m128 -- set ZF/CF from bitwise AND/ANDN with memory (SSE4.1).
void Assembler::ptest(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);        // PTEST
  emit_operand(dst, src);
}

// PTEST xmm, xmm.
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_byte(0xC0 | encode);
}

// PUNPCKLBW xmm, m128 -- interleave low bytes with a memory operand.
void Assembler::punpcklbw(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x60);        // PUNPCKLBW
  emit_operand(dst, src);
}

// PUNPCKLBW xmm, xmm.
void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x60);
  emit_byte(0xC0 | encode);
}

// PUNPCKLDQ xmm, m128 -- interleave low doublewords with a memory operand.
void Assembler::punpckldq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x62);        // PUNPCKLDQ
  emit_operand(dst, src);
}

// PUNPCKLDQ xmm, xmm.
void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x62);
  emit_byte(0xC0 | encode);
}
2575 
2576 void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
2577   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2578   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
2579   emit_byte(0x6C);
2580   emit_byte(0xC0 | encode);
2581 }
2582 
// PUSH imm32 -- push a sign-extended 32-bit immediate onto the stack.
void Assembler::push(int32_t imm32) {
  // in 64bits we push 64bits onto the stack but only
  // take a 32bit immediate
  emit_byte(0x68); // PUSH imm32
  emit_long(imm32);
}
2589 
2590 void Assembler::push(Register src) {
2591   int encode = prefix_and_encode(src->encoding());
2592 
2593   emit_byte(0x50 | encode);
2594 }
2595 
// PUSHF -- push flags register onto the stack.
void Assembler::pushf() {
  emit_byte(0x9C);
}

#ifndef _LP64 // no 32bit push/pop on amd64
// PUSH m32 -- push a memory operand onto the stack.
void Assembler::pushl(Address src) {
  // Note this will push 64bit on 64bit
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xFF);        // group-5 opcode
  emit_operand(rsi, src); // rsi = /6 extension selects PUSH
}
#endif

// PXOR xmm, m128 -- bitwise xor with a memory operand.
void Assembler::pxor(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEF);        // PXOR
  emit_operand(dst, src);
}
2618 
2619 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
2620   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2621   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
2622   emit_byte(0xEF);
2623   emit_byte(0xC0 | encode);
2624 }
2625 
2626 void Assembler::rcll(Register dst, int imm8) {
2627   assert(isShiftCount(imm8), "illegal shift count");
2628   int encode = prefix_and_encode(dst->encoding());
2629   if (imm8 == 1) {
2630     emit_byte(0xD1);
2631     emit_byte(0xD0 | encode);
2632   } else {
2633     emit_byte(0xC1);
2634     emit_byte(0xD0 | encode);
2635     emit_byte(imm8);
2636   }
2637 }
2638 
// copies data from [esi] to [edi] using rcx pointer sized words
// generic
void Assembler::rep_mov() {
  emit_byte(0xF3); // REP prefix
  // MOVSQ
  LP64_ONLY(prefix(REX_W)); // widen to 64-bit elements on LP64
  emit_byte(0xA5);
}

// sets rcx pointer sized words with rax, value at [edi]
// generic
void Assembler::rep_set() { // rep_set
  emit_byte(0xF3); // REP prefix
  // STOSQ
  LP64_ONLY(prefix(REX_W)); // widen to 64-bit elements on LP64
  emit_byte(0xAB);
}

// scans rcx pointer sized words at [edi] for occurance of rax,
// generic
void Assembler::repne_scan() { // repne_scan
  emit_byte(0xF2); // REPNE prefix
  // SCASQ
  LP64_ONLY(prefix(REX_W)); // widen to 64-bit elements on LP64
  emit_byte(0xAF);
}

#ifdef _LP64
// scans rcx 4 byte words at [edi] for occurance of rax,
// generic
void Assembler::repne_scanl() { // repne_scan
  emit_byte(0xF2); // REPNE prefix
  // SCASL
  emit_byte(0xAF);
}
#endif
2675 
2676 void Assembler::ret(int imm16) {
2677   if (imm16 == 0) {
2678     emit_byte(0xC3);
2679   } else {
2680     emit_byte(0xC2);
2681     emit_word(imm16);
2682   }
2683 }
2684 
// SAHF -- store AH into the low byte of the flags register (32-bit only).
void Assembler::sahf() {
#ifdef _LP64
  // Not supported in 64bit mode
  ShouldNotReachHere();
#endif
  emit_byte(0x9E);
}
2692 
2693 void Assembler::sarl(Register dst, int imm8) {
2694   int encode = prefix_and_encode(dst->encoding());
2695   assert(isShiftCount(imm8), "illegal shift count");
2696   if (imm8 == 1) {
2697     emit_byte(0xD1);
2698     emit_byte(0xF8 | encode);
2699   } else {
2700     emit_byte(0xC1);
2701     emit_byte(0xF8 | encode);
2702     emit_byte(imm8);
2703   }
2704 }
2705 
// SAR dst, CL -- arithmetic right shift by the count in CL.
void Assembler::sarl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);          // group-2 shift by CL
  emit_byte(0xF8 | encode); // /7 extension selects SAR
}

// SBB m32, imm32 -- subtract with borrow into a memory operand.
void Assembler::sbbl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbx, dst, imm32); // rbx = /3 extension selects SBB
}

// SBB r32, imm32.
void Assembler::sbbl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD8, dst, imm32);
}


// SBB r32, m32.
void Assembler::sbbl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x1B);        // SBB r32, r/m32
  emit_operand(dst, src);
}

// SBB r32, r32.
void Assembler::sbbl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}

// SETcc r8 -- set byte register to 1 if condition cc holds, else 0.
void Assembler::setb(Condition cc, Register dst) {
  assert(0 <= cc && cc < 16, "illegal cc");
  int encode = prefix_and_encode(dst->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0x90 | cc);   // SETcc opcode family 0F 90..9F
  emit_byte(0xC0 | encode);
}
2743 
2744 void Assembler::shll(Register dst, int imm8) {
2745   assert(isShiftCount(imm8), "illegal shift count");
2746   int encode = prefix_and_encode(dst->encoding());
2747   if (imm8 == 1 ) {
2748     emit_byte(0xD1);
2749     emit_byte(0xE0 | encode);
2750   } else {
2751     emit_byte(0xC1);
2752     emit_byte(0xE0 | encode);
2753     emit_byte(imm8);
2754   }
2755 }
2756 
// SHL dst, CL -- logical left shift by the count in CL.
void Assembler::shll(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);          // group-2 shift by CL
  emit_byte(0xE0 | encode); // /4 extension selects SHL
}

// SHR dst, imm8 -- logical right shift by an immediate count.
void Assembler::shrl(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xC1);          // group-2 shift by imm8
  emit_byte(0xE8 | encode); // /5 extension selects SHR
  emit_byte(imm8);
}

// SHR dst, CL -- logical right shift by the count in CL.
void Assembler::shrl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);          // group-2 shift by CL
  emit_byte(0xE8 | encode); // /5 extension selects SHR
}
2776 
2777 // copies a single word from [esi] to [edi]
2778 void Assembler::smovl() {
2779   emit_byte(0xA5);
2780 }
2781 
2782 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
2783   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2784   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
2785   emit_byte(0x51);
2786   emit_byte(0xC0 | encode);
2787 }
2788 
2789 void Assembler::sqrtsd(XMMRegister dst, Address src) {
2790   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
2791   InstructionMark im(this);
2792   simd_prefix(dst, dst, src, VEX_SIMD_F2);
2793   emit_byte(0x51);
2794   emit_operand(dst, src);
2795 }
2796 
2797 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
2798   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2799   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
2800   emit_byte(0x51);
2801   emit_byte(0xC0 | encode);
2802 }
2803 
2804 void Assembler::sqrtss(XMMRegister dst, Address src) {
2805   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2806   InstructionMark im(this);
2807   simd_prefix(dst, dst, src, VEX_SIMD_F3);
2808   emit_byte(0x51);
2809   emit_operand(dst, src);
2810 }
2811 
2812 void Assembler::stmxcsr( Address dst) {
2813   NOT_LP64(assert(VM_Version::supports_sse(), ""));
2814   InstructionMark im(this);
2815   prefix(dst);
2816   emit_byte(0x0F);
2817   emit_byte(0xAE);
2818   emit_operand(as_Register(3), dst);
2819 }
2820 
// subl m32, imm32 -- 81 /5 (rbp's encoding, 5, supplies the /5 opcode extension).
void Assembler::subl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);
}

// subl m32, r32 -- 29 /r.
void Assembler::subl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}

// subl r32, imm32 -- 81 /5 (emit_arith may shorten to the sign-extended 8-bit form).
void Assembler::subl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE8, dst, imm32);
}

// Force generation of a 4 byte immediate value even if it fits into 8bit
// (needed when the immediate will be patched later).
void Assembler::subl_imm32(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}

// subl r32, m32 -- 2B /r.
void Assembler::subl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}

// subl r32, r32 -- 2B /r, register-direct.
void Assembler::subl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
2856 
// subsd dst, src -- scalar double subtract, register form (F2 0F 5C /r).
void Assembler::subsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// subsd dst, m64 -- scalar double subtract from memory.
void Assembler::subsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

// subss dst, src -- scalar single subtract, register form (F3 0F 5C /r).
void Assembler::subss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// subss dst, m32 -- scalar single subtract from memory.
void Assembler::subss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_operand(dst, src);
}
2886 
// testb r8, imm8 -- F6 /0 ib; requires a byte-addressable register on 32-bit.
void Assembler::testb(Register dst, int imm8) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  (void) prefix_and_encode(dst->encoding(), true);
  emit_arith_b(0xF6, 0xC0, dst, imm8);
}

// testl r32, imm32 -- A9 id (rax short form) or F7 /0 id.
void Assembler::testl(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    emit_byte(0xA9);  // one-byte shorter encoding dedicated to eax/rax
  } else {
    encode = prefix_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}

// testl r32, r32 -- 85 /r, register-direct.
void Assembler::testl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}

// testl r32, m32 -- 85 /r.
void Assembler::testl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x85);
  emit_operand(dst, src);
}
2919 
// ucomisd dst, m64 -- unordered compare of scalar doubles, sets EFLAGS (66 0F 2E /r).
void Assembler::ucomisd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x2E);
  emit_operand(dst, src);
}

// ucomisd dst, src -- register form.
void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x2E);
  emit_byte(0xC0 | encode);
}

// ucomiss dst, m32 -- unordered compare of scalar singles, sets EFLAGS (0F 2E /r).
void Assembler::ucomiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2E);
  emit_operand(dst, src);
}

// ucomiss dst, src -- register form.
void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2E);
  emit_byte(0xC0 | encode);
}
2949 
2950 
// xaddl m32, r32 -- exchange-and-add (0F C1 /r); callers add LOCK prefix separately.
void Assembler::xaddl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}

// xchgl r32, m32 -- 87 /r; implicitly locked by the hardware for memory operands.
void Assembler::xchgl(Register dst, Address src) { // xchg
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}

// xchgl r32, r32 -- 87 /r, register-direct.
void Assembler::xchgl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xc0 | encode);
}
2971 
// xorl r32, imm32 -- 81 /6.
void Assembler::xorl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF0, dst, imm32);
}

// xorl r32, m32 -- 33 /r.
void Assembler::xorl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}

// xorl r32, r32 -- 33 /r, register-direct.
void Assembler::xorl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}

// xorpd dst, src -- bitwise XOR of packed doubles (66 0F 57 /r).
void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

// xorpd dst, m128 -- memory form.
void Assembler::xorpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x57);
  emit_operand(dst, src);
}


// xorps dst, src -- bitwise XOR of packed singles (0F 57 /r).
void Assembler::xorps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

// xorps dst, m128 -- memory form.
void Assembler::xorps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x57);
  emit_operand(dst, src);
}
3019 
3020 // AVX 3-operands non destructive source instructions (encoded with VEX prefix)
3021 
// vaddsd dst, nds, m64 -- AVX three-operand scalar double add: dst = nds + [src].
void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_operand(dst, src);
}

// vaddsd dst, nds, src -- register form.
void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

// vaddss dst, nds, m32 -- AVX scalar single add: dst = nds + [src].
void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_operand(dst, src);
}

// vaddss dst, nds, src -- register form.
void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

// vandpd dst, nds, m128 -- AVX bitwise AND of packed doubles.
void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
  emit_byte(0x54);
  emit_operand(dst, src);
}

// vandps dst, nds, m128 -- AVX bitwise AND of packed singles.
void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
  emit_byte(0x54);
  emit_operand(dst, src);
}

// vdivsd dst, nds, m64 -- AVX scalar double divide: dst = nds / [src].
void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

// vdivsd dst, nds, src -- register form.
void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

// vdivss dst, nds, m32 -- AVX scalar single divide: dst = nds / [src].
void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

// vdivss dst, nds, src -- register form.
void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

// vmulsd dst, nds, m64 -- AVX scalar double multiply: dst = nds * [src].
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_operand(dst, src);
}

// vmulsd dst, nds, src -- register form.
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}
3112 
3113 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
3114   InstructionMark im(this);
3115   vex_prefix(dst, nds, src, VEX_SIMD_F3);
3116   emit_byte(0x59);
3117   emit_operand(dst, src);
3118 }
3119 
// vmulss dst, nds, src -- AVX scalar single multiply, register form: dst = nds * src.
void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}
3126 
3127 
// vsubsd dst, nds, m64 -- AVX scalar double subtract: dst = nds - [src].
void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

// vsubsd dst, nds, src -- register form.
void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// vsubss dst, nds, m32 -- AVX scalar single subtract: dst = nds - [src].
void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

// vsubss dst, nds, src -- register form.
void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// vxorpd dst, nds, m128 -- AVX bitwise XOR of packed doubles.
void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
  emit_byte(0x57);
  emit_operand(dst, src);
}

// vxorpd dst, nds, src -- register form; vector256 selects the 256-bit (ymm) encoding.
void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

// vxorps dst, nds, m128 -- AVX bitwise XOR of packed singles.
void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
  emit_byte(0x57);
  emit_operand(dst, src);
}

// vxorps dst, nds, src -- register form; vector256 selects the 256-bit (ymm) encoding.
void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}
3187 
3188 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3189   assert(VM_Version::supports_avx2() || (!vector256) && VM_Version::supports_avx(), "");
3190   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
3191   emit_byte(0xEF);
3192   emit_byte(0xC0 | encode);
3193 }
3194 
// vinsertf128 dst, nds, src, 1 -- insert the 128-bit src into the upper half
// of the 256-bit dst, copying the lower half from nds (VEX.256.66 0F3A 18 /r ib).
void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
  emit_byte(0x18);
  emit_byte(0xC0 | encode);
  // 0x00 - insert into lower 128 bits
  // 0x01 - insert into upper 128 bits
  emit_byte(0x01);
}

// vinserti128 dst, nds, src, 1 -- integer variant of the above; AVX2 only
// (VEX.256.66 0F3A 38 /r ib).
void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx2(), "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
  emit_byte(0x38);
  emit_byte(0xC0 | encode);
  // 0x00 - insert into lower 128 bits
  // 0x01 - insert into upper 128 bits
  emit_byte(0x01);
}

// vzeroupper -- zero the upper 128 bits of all ymm registers, avoiding
// AVX/SSE transition penalties (VEX.NONE 0F 77).
void Assembler::vzeroupper() {
  assert(VM_Version::supports_avx(), "");
  (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
  emit_byte(0x77);
}
3222 
3223 
3224 #ifndef _LP64
3225 // 32bit only pieces of the assembler
3226 
// cmpl r32, imm32 with relocation info -- 81 /7 id (0xF8 = ModRM for /7 cmp,
// register-direct).  32-bit only, so no REX prefix is ever needed.
void Assembler::cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT
  InstructionMark im(this);
  emit_byte(0x81);
  emit_byte(0xF8 | src1->encoding());
  emit_data(imm32, rspec, 0);
}

// cmpl m32, imm32 with relocation info -- 81 /7 (rdi's encoding, 7, supplies /7).
void Assembler::cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs
  InstructionMark im(this);
  emit_byte(0x81);
  emit_operand(rdi, src1);
  emit_data(imm32, rspec, 0);
}

// The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax,
// and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded
// into rdx:rax.  The ZF is set if the compared values were equal, and cleared otherwise.
void Assembler::cmpxchg8(Address adr) {
  InstructionMark im(this);
  emit_byte(0x0F);
  emit_byte(0xc7);
  emit_operand(rcx, adr);  // rcx's encoding, 1, supplies the /1 opcode extension (cmpxchg8b)
}

// decl r32 -- single-byte 48+rd form, valid only in 32-bit mode (0x48.. are REX in 64-bit).
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
 emit_byte(0x48 | dst->encoding());
}
3257 
3258 #endif // _LP64
3259 
3260 // 64bit typically doesn't use the x87 but needs to for the trig funcs
3261 
// ---- x87 FPU emitters ----------------------------------------------------
// Each function emits the raw opcode bytes for one x87 instruction.  For the
// memory forms, the Register passed to emit_operand32() only supplies the
// 3-bit opcode-extension field of the ModRM byte (rax=/0, rcx=/1, rdx=/2,
// rbx=/3, rsp=/4, rbp=/5, rsi=/6, rdi=/7) -- it is not a real operand.

// fabs -- |ST(0)| (D9 E1).
void Assembler::fabs() {
  emit_byte(0xD9);
  emit_byte(0xE1);
}

// fadd ST(0), ST(i).
void Assembler::fadd(int i) {
  emit_farith(0xD8, 0xC0, i);
}

// fadd m64 -- DC /0.
void Assembler::fadd_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rax, src);
}

// fadd m32 -- D8 /0.
void Assembler::fadd_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rax, src);
}

// fadd ST(i), ST(0) (DC C0+i).
void Assembler::fadda(int i) {
  emit_farith(0xDC, 0xC0, i);
}

// faddp ST(i), ST(0) and pop (DE C0+i).
void Assembler::faddp(int i) {
  emit_farith(0xDE, 0xC0, i);
}

// fchs -- negate ST(0) (D9 E0).
void Assembler::fchs() {
  emit_byte(0xD9);
  emit_byte(0xE0);
}

// fcom ST(i).
void Assembler::fcom(int i) {
  emit_farith(0xD8, 0xD0, i);
}

// fcomp ST(i) -- compare and pop.
void Assembler::fcomp(int i) {
  emit_farith(0xD8, 0xD8, i);
}

// fcomp m64 -- DC /3.
void Assembler::fcomp_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbx, src);
}

// fcomp m32 -- D8 /3.
void Assembler::fcomp_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbx, src);
}

// fcompp -- compare ST(0) with ST(1) and pop both (DE D9).
void Assembler::fcompp() {
  emit_byte(0xDE);
  emit_byte(0xD9);
}

// fcos -- ST(0) = cos(ST(0)) (D9 FF).
void Assembler::fcos() {
  emit_byte(0xD9);
  emit_byte(0xFF);
}

// fdecstp -- decrement the FPU stack-top pointer (D9 F6).
void Assembler::fdecstp() {
  emit_byte(0xD9);
  emit_byte(0xF6);
}

// fdiv ST(0), ST(i).
void Assembler::fdiv(int i) {
  emit_farith(0xD8, 0xF0, i);
}

// fdiv m64 -- DC /6.
void Assembler::fdiv_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsi, src);
}

// fdiv m32 -- D8 /6.
void Assembler::fdiv_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsi, src);
}

// fdiv ST(i), ST(0) (DC F8+i).
void Assembler::fdiva(int i) {
  emit_farith(0xDC, 0xF8, i);
}

// Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994)
//       is erroneous for some of the floating-point instructions below.

void Assembler::fdivp(int i) {
  emit_farith(0xDE, 0xF8, i);                    // ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong)
}

// fdivr ST(0), ST(i) -- reversed divide.
void Assembler::fdivr(int i) {
  emit_farith(0xD8, 0xF8, i);
}

// fdivr m64 -- DC /7.
void Assembler::fdivr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rdi, src);
}

// fdivr m32 -- D8 /7.
void Assembler::fdivr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rdi, src);
}

// fdivr ST(i), ST(0) (DC F0+i).
void Assembler::fdivra(int i) {
  emit_farith(0xDC, 0xF0, i);
}

void Assembler::fdivrp(int i) {
  emit_farith(0xDE, 0xF0, i);                    // ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong)
}

// ffree ST(i) -- mark register as empty (DD C0+i).
void Assembler::ffree(int i) {
  emit_farith(0xDD, 0xC0, i);
}

// fild m64int -- DF /5.
void Assembler::fild_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rbp, adr);
}

// fild m32int -- DB /0.
void Assembler::fild_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rax, adr);
}

// fincstp -- increment the FPU stack-top pointer (D9 F7).
void Assembler::fincstp() {
  emit_byte(0xD9);
  emit_byte(0xF7);
}

// finit -- 0x9B is FWAIT, so this emits the waiting form (fwait; fninit).
void Assembler::finit() {
  emit_byte(0x9B);
  emit_byte(0xDB);
  emit_byte(0xE3);
}

// fist m32int -- DB /2 (store without popping).
void Assembler::fist_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdx, adr);
}

// fistp m64int -- DF /7 (store and pop).
void Assembler::fistp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rdi, adr);
}

// fistp m32int -- DB /3 (store and pop).
void Assembler::fistp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbx, adr);
}

// fld1 -- push +1.0 (D9 E8).
void Assembler::fld1() {
  emit_byte(0xD9);
  emit_byte(0xE8);
}

// fld m64 -- DD /0.
void Assembler::fld_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rax, adr);
}

// fld m32 -- D9 /0.
void Assembler::fld_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rax, adr);
}


// fld ST(index) -- push a copy of a stack register (D9 C0+i).
void Assembler::fld_s(int index) {
  emit_farith(0xD9, 0xC0, index);
}

// fld m80 -- extended-precision load, DB /5.
void Assembler::fld_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbp, adr);
}

// fldcw m16 -- load the FPU control word, D9 /5.
void Assembler::fldcw(Address src) {
  InstructionMark im(this);
  emit_byte(0xd9);
  emit_operand32(rbp, src);
}

// fldenv m -- load the FPU environment, D9 /4.
void Assembler::fldenv(Address src) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rsp, src);
}

// fldlg2 -- push log10(2) (D9 EC).
void Assembler::fldlg2() {
  emit_byte(0xD9);
  emit_byte(0xEC);
}

// fldln2 -- push ln(2) (D9 ED).
void Assembler::fldln2() {
  emit_byte(0xD9);
  emit_byte(0xED);
}

// fldz -- push +0.0 (D9 EE).
void Assembler::fldz() {
  emit_byte(0xD9);
  emit_byte(0xEE);
}

// flog -- ST(0) = ln(ST(0)), composed as ln(2) * log2(x) via fyl2x.
void Assembler::flog() {
  fldln2();
  fxch();
  fyl2x();
}

// flog10 -- ST(0) = log10(ST(0)), composed as log10(2) * log2(x) via fyl2x.
void Assembler::flog10() {
  fldlg2();
  fxch();
  fyl2x();
}
// fmul ST(0), ST(i).
void Assembler::fmul(int i) {
  emit_farith(0xD8, 0xC8, i);
}

// fmul m64 -- DC /1.
void Assembler::fmul_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rcx, src);
}

// fmul m32 -- D8 /1.
void Assembler::fmul_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rcx, src);
}

// fmul ST(i), ST(0) (DC C8+i).
void Assembler::fmula(int i) {
  emit_farith(0xDC, 0xC8, i);
}

// fmulp ST(i), ST(0) and pop (DE C8+i).
void Assembler::fmulp(int i) {
  emit_farith(0xDE, 0xC8, i);
}

// fnsave m -- save FPU state without a preceding wait, DD /6.
void Assembler::fnsave(Address dst) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsi, dst);
}

// Store the FPU control word -- 0x9B is FWAIT, so this is actually the
// waiting form (fstcw) despite the fnstcw name; D9 /7.
void Assembler::fnstcw(Address src) {
  InstructionMark im(this);
  emit_byte(0x9B);
  emit_byte(0xD9);
  emit_operand32(rdi, src);
}

// fnstsw ax -- store the FPU status word into ax (DF E0).
void Assembler::fnstsw_ax() {
  emit_byte(0xdF);
  emit_byte(0xE0);
}

// fprem -- partial remainder, truncating (D9 F8).
void Assembler::fprem() {
  emit_byte(0xD9);
  emit_byte(0xF8);
}

// fprem1 -- IEEE partial remainder (D9 F5).
void Assembler::fprem1() {
  emit_byte(0xD9);
  emit_byte(0xF5);
}

// frstor m -- restore FPU state, DD /4.
void Assembler::frstor(Address src) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsp, src);
}

// fsin -- ST(0) = sin(ST(0)) (D9 FE).
void Assembler::fsin() {
  emit_byte(0xD9);
  emit_byte(0xFE);
}

// fsqrt -- ST(0) = sqrt(ST(0)) (D9 FA).
void Assembler::fsqrt() {
  emit_byte(0xD9);
  emit_byte(0xFA);
}

// fst m64 -- DD /2 (store without popping).
void Assembler::fst_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rdx, adr);
}

// fst m32 -- D9 /2 (store without popping).
void Assembler::fst_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rdx, adr);
}

// fstp m64 -- DD /3 (store and pop).
void Assembler::fstp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rbx, adr);
}

// fstp ST(index) -- copy ST(0) into a stack register and pop (DD D8+i).
void Assembler::fstp_d(int index) {
  emit_farith(0xDD, 0xD8, index);
}

// fstp m32 -- D9 /3 (store and pop).
void Assembler::fstp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rbx, adr);
}

// fstp m80 -- extended-precision store and pop, DB /7.
void Assembler::fstp_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdi, adr);
}

// fsub ST(0), ST(i).
void Assembler::fsub(int i) {
  emit_farith(0xD8, 0xE0, i);
}

// fsub m64 -- DC /4.
void Assembler::fsub_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsp, src);
}

// fsub m32 -- D8 /4.
void Assembler::fsub_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsp, src);
}

// fsub ST(i), ST(0) (DC E8+i).
void Assembler::fsuba(int i) {
  emit_farith(0xDC, 0xE8, i);
}

void Assembler::fsubp(int i) {
  emit_farith(0xDE, 0xE8, i);                    // ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong)
}

// fsubr ST(0), ST(i) -- reversed subtract.
void Assembler::fsubr(int i) {
  emit_farith(0xD8, 0xE8, i);
}

// fsubr m64 -- DC /5.
void Assembler::fsubr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbp, src);
}

// fsubr m32 -- D8 /5.
void Assembler::fsubr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbp, src);
}

// fsubr ST(i), ST(0) (DC E0+i).
void Assembler::fsubra(int i) {
  emit_farith(0xDC, 0xE0, i);
}

void Assembler::fsubrp(int i) {
  emit_farith(0xDE, 0xE0, i);                    // ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong)
}

// ftan -- fptan (D9 F2) pushes a 1.0 after the result; the trailing
// fstp st(0) (DD D8) pops it so only tan(x) remains.
void Assembler::ftan() {
  emit_byte(0xD9);
  emit_byte(0xF2);
  emit_byte(0xDD);
  emit_byte(0xD8);
}

// ftst -- compare ST(0) against 0.0 (D9 E4).
void Assembler::ftst() {
  emit_byte(0xD9);
  emit_byte(0xE4);
}

// fucomi ST(0), ST(i) -- unordered compare setting EFLAGS directly.
void Assembler::fucomi(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDB, 0xE8, i);
}

// fucomip ST(0), ST(i) -- as fucomi, then pop.
void Assembler::fucomip(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDF, 0xE8, i);
}

// fwait -- wait for pending FPU exceptions (9B).
void Assembler::fwait() {
  emit_byte(0x9B);
}

// fxch ST(i) -- exchange ST(0) with ST(i) (D9 C8+i).
void Assembler::fxch(int i) {
  emit_farith(0xD9, 0xC8, i);
}

// fyl2x -- ST(1) = ST(1) * log2(ST(0)), then pop (D9 F1).
void Assembler::fyl2x() {
  emit_byte(0xD9);
  emit_byte(0xF1);
}

// frndint -- round ST(0) to integer per the rounding mode (D9 FC).
void Assembler::frndint() {
  emit_byte(0xD9);
  emit_byte(0xFC);
}

// f2xm1 -- ST(0) = 2^ST(0) - 1 (D9 F0).
void Assembler::f2xm1() {
  emit_byte(0xD9);
  emit_byte(0xF0);
}

// fldl2e -- push log2(e) (D9 EA).
void Assembler::fldl2e() {
  emit_byte(0xD9);
  emit_byte(0xEA);
}
3695 
// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
static int simd_opc[4] = { 0,    0, 0x38, 0x3A };

// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
// Memory-operand form: emits (optional) SIMD prefix, then the REX prefix
// derived from the address and xmm register, then the 0F(+38/3A) escape bytes.
void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);
  }
  if (rex_w) {
    prefixq(adr, xreg);  // REX.W form for 64-bit operand size
  } else {
    prefix(adr, xreg);
  }
  if (opc > 0) {
    emit_byte(0x0F);
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);  // second escape byte (0F 38 / 0F 3A maps)
    }
  }
}

// Register-register form of the above; returns the low-3-bit encodings packed
// for the caller's ModRM byte.
int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);
  }
  int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
                          prefix_and_encode(dst_enc, src_enc);
  if (opc > 0) {
    emit_byte(0x0F);
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);
    }
  }
  return encode;
}
3735 
3736 
3737 void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
3738   if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
3739     prefix(VEX_3bytes);
3740 
3741     int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
3742     byte1 = (~byte1) & 0xE0;
3743     byte1 |= opc;
3744     a_byte(byte1);
3745 
3746     int byte2 = ((~nds_enc) & 0xf) << 3;
3747     byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
3748     emit_byte(byte2);
3749   } else {
3750     prefix(VEX_2bytes);
3751 
3752     int byte1 = vex_r ? VEX_R : 0;
3753     byte1 = (~byte1) & 0x80;
3754     byte1 |= ((~nds_enc) & 0xf) << 3;
3755     byte1 |= (vector256 ? 4 : 0) | pre;
3756     emit_byte(byte1);
3757   }
3758 }
3759 
3760 void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
3761   bool vex_r = (xreg_enc >= 8);
3762   bool vex_b = adr.base_needs_rex();
3763   bool vex_x = adr.index_needs_rex();
3764   vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
3765 }
3766 
3767 int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
3768   bool vex_r = (dst_enc >= 8);
3769   bool vex_b = (src_enc >= 8);
3770   bool vex_x = false;
3771   vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
3772   return (((dst_enc & 7) << 3) | (src_enc & 7));
3773 }
3774 
3775 
// Emit either a VEX prefix (when AVX is enabled) or the legacy SSE
// prefix/escape bytes for a memory-operand SIMD instruction.  nds is only
// meaningful under AVX (three-operand form); legacy SSE requires it to be
// the destination or absent.
void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  if (UseAVX > 0) {
    int xreg_enc = xreg->encoding();
    int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
    rex_prefix(adr, xreg, pre, opc, rex_w);
  }
}

// Register-register variant of the above; returns the packed encodings for
// the caller's ModRM byte.
int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  int dst_enc = dst->encoding();
  int src_enc = src->encoding();
  if (UseAVX > 0) {
    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
    return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
  }
}
3798 
3799 #ifndef _LP64
3800 
// incl r32 -- single-byte 40+rd form, valid only in 32-bit mode.
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  emit_byte(0x40 | dst->encoding());
}

// On 32-bit, lea is just leal.
void Assembler::lea(Register dst, Address src) {
  leal(dst, src);
}

// movl m32, imm32 with relocation info -- C7 /0 id.
void Assembler::mov_literal32(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xC7);
  emit_operand(rax, dst);  // rax's encoding, 0, supplies the /0 opcode extension
  emit_data((int)imm32, rspec, 0);
}

// movl r32, imm32 with relocation info -- B8+rd id.
void Assembler::mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, 0);
}

void Assembler::popa() { // 32bit
  emit_byte(0x61);
}

// push imm32 with relocation info -- 68 id.
void Assembler::push_literal32(int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0x68);
  emit_data(imm32, rspec, 0);
}

void Assembler::pusha() { // 32bit
  emit_byte(0x60);
}

// setne dst -- 0F 95, register-direct (32-bit form; no prefix needed).
void Assembler::set_byte_if_not_zero(Register dst) {
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | dst->encoding());
}

// shld dst, src, cl -- 0F A5 /r; shift count comes from cl.
void Assembler::shldl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xA5);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}

// shrd dst, src, cl -- 0F AD /r; shift count comes from cl.
void Assembler::shrdl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xAD);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}
3855 
3856 #else // LP64
3857 
// setne dst -- 0F 95, register-direct.  64-bit form: may need a REX prefix
// to reach the extended / byte registers.
void Assembler::set_byte_if_not_zero(Register dst) {
  int enc = prefix_and_encode(dst->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | enc);
}
3864 
3865 // 64bit only pieces of the assembler
3866 // This should only be used by 64bit instructions that can use rip-relative
3867 // it cannot be used by instructions that want an immediate value.
3868 
// Returns true when adr._target can be addressed rip-relatively (i.e. its
// displacement from anywhere code might end up in the code cache fits in a
// signed 32-bit field), based on the literal's relocation type.
bool Assembler::reachable(AddressLiteral adr) {
  int64_t disp;
  // None will force a 64bit literal to the code stream. Likely a placeholder
  // for something that will be patched later and we need to certain it will
  // always be reachable.
  if (adr.reloc() == relocInfo::none) {
    return false;
  }
  if (adr.reloc() == relocInfo::internal_word_type) {
    // This should be rip relative and easily reachable.
    return true;
  }
  if (adr.reloc() == relocInfo::virtual_call_type ||
      adr.reloc() == relocInfo::opt_virtual_call_type ||
      adr.reloc() == relocInfo::static_call_type ||
      adr.reloc() == relocInfo::static_stub_type ) {
    // This should be rip relative within the code cache and easily
    // reachable until we get huge code caches. (At which point
    // ic code is going to have issues).
    return true;
  }
  if (adr.reloc() != relocInfo::external_word_type &&
      adr.reloc() != relocInfo::poll_return_type &&  // these are really external_word but need special
      adr.reloc() != relocInfo::poll_type &&         // relocs to identify them
      adr.reloc() != relocInfo::runtime_call_type ) {
    return false;
  }

  // Stress the correction code
  if (ForceUnreachable) {
    // Must be runtimecall reloc, see if it is in the codecache
    // Flipping stuff in the codecache to be unreachable causes issues
    // with things like inline caches where the additional instructions
    // are not handled.
    if (CodeCache::find_blob(adr._target) == NULL) {
      return false;
    }
  }
  // For external_word_type/runtime_call_type if it is reachable from where we
  // are now (possibly a temp buffer) and where we might end up
  // anywhere in the codeCache then we are always reachable.
  // This would have to change if we ever save/restore shared code
  // to be more pessimistic.
  // Check the displacement from both extremes of the code cache.
  disp = (int64_t)adr._target - ((int64_t)CodeCache::low_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;
  disp = (int64_t)adr._target - ((int64_t)CodeCache::high_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;

  disp = (int64_t)adr._target - ((int64_t)_code_pos + sizeof(int));

  // Because rip relative is a disp + address_of_next_instruction and we
  // don't know the value of address_of_next_instruction we apply a fudge factor
  // to make sure we will be ok no matter the size of the instruction we get placed into.
  // We don't have to fudge the checks above here because they are already worst case.

  // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal
  // + 4 because better safe than sorry.
  const int fudge = 12 + 4;
  if (disp < 0) {
    disp -= fudge;
  } else {
    disp += fudge;
  }
  return is_simm32(disp);
}
3934 
3935 // Check if the polling page is not reachable from the code cache using rip-relative
3936 // addressing.
3937 bool Assembler::is_polling_page_far() {
3938   intptr_t addr = (intptr_t)os::get_polling_page();
3939   return ForceUnreachable ||
3940          !is_simm32(addr - (intptr_t)CodeCache::low_bound()) ||
3941          !is_simm32(addr - (intptr_t)CodeCache::high_bound());
3942 }
3943 
// Emit a 64-bit literal, wrapping a bare relocType into a simple
// RelocationHolder; relocInfo::none emits the raw bits with no reloc.
void Assembler::emit_data64(jlong data,
                            relocInfo::relocType rtype,
                            int format) {
  if (rtype == relocInfo::none) {
    emit_long64(data);
  } else {
    emit_data64(data, Relocation::spec_simple(rtype), format);
  }
}

// Emit a 64-bit literal with relocation info.  The relocation is recorded
// at the enclosing instruction's start (inst_mark), not at the literal.
void Assembler::emit_data64(jlong data,
                            RelocationHolder const& rspec,
                            int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(imm_operand == format, "must be immediate");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  // Do not use AbstractAssembler::relocate, which is not intended for
  // embedded words.  Instead, relocate to the enclosing instruction.
  code_section()->relocate(inst_mark(), rspec, format);
#ifdef ASSERT
  check_relocation(rspec, format);
#endif
  emit_long64(data);
}
3968 
// Emit any REX prefix needed for a single-register operand and return the
// register's low 3 bits (ready to OR into an opcode or ModRM byte).
int Assembler::prefix_and_encode(int reg_enc, bool byteinst) {
  if (reg_enc >= 8) {
    // r8-r15 need REX.B
    prefix(REX_B);
    reg_enc -= 8;
  } else if (byteinst && reg_enc >= 4) {
    // byte ops on encodings 4-7 need a bare REX to mean spl/bpl/sil/dil
    prefix(REX);
  }
  return reg_enc;
}

// Same as above but for 64-bit operand size: always emits REX.W (with B
// added for the extended registers).
int Assembler::prefixq_and_encode(int reg_enc) {
  if (reg_enc < 8) {
    prefix(REX_W);
  } else {
    prefix(REX_WB);
    reg_enc -= 8;
  }
  return reg_enc;
}

// Two-register form: emit the REX prefix covering both operands and return
// the combined ModRM reg/rm bits (dst in reg field, src in rm field).
int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst) {
  if (dst_enc < 8) {
    if (src_enc >= 8) {
      prefix(REX_B);
      src_enc -= 8;
    } else if (byteinst && src_enc >= 4) {
      prefix(REX);
    }
  } else {
    if (src_enc < 8) {
      prefix(REX_R);
    } else {
      prefix(REX_RB);
      src_enc -= 8;
    }
    dst_enc -= 8;
  }
  return dst_enc << 3 | src_enc;
}

// Two-register form with REX.W for 64-bit operand size.
int Assembler::prefixq_and_encode(int dst_enc, int src_enc) {
  if (dst_enc < 8) {
    if (src_enc < 8) {
      prefix(REX_W);
    } else {
      prefix(REX_WB);
      src_enc -= 8;
    }
  } else {
    if (src_enc < 8) {
      prefix(REX_WR);
    } else {
      prefix(REX_WRB);
      src_enc -= 8;
    }
    dst_enc -= 8;
  }
  return dst_enc << 3 | src_enc;
}
4028 
// Emit REX.B if reg is one of the extended registers (r8-r15).
void Assembler::prefix(Register reg) {
  if (reg->encoding() >= 8) {
    prefix(REX_B);
  }
}

// Emit the REX prefix required by a memory operand's base (B) and
// index (X) registers; nothing is emitted if neither needs one.
void Assembler::prefix(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_XB);
    } else {
      prefix(REX_B);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_X);
    }
  }
}

// 64-bit-operand variant: always emits a REX with W set, plus B/X as
// required by the memory operand.
void Assembler::prefixq(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_WXB);
    } else {
      prefix(REX_WB);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_WX);
    } else {
      prefix(REX_W);
    }
  }
}
4064 
4065 
// Emit the REX prefix for a reg + memory operand pair: R for the register
// operand, B/X for the address's base/index.  With byteinst, encodings 4-7
// still need a bare REX to select spl/bpl/sil/dil.
void Assembler::prefix(Address adr, Register reg, bool byteinst) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      } else if (byteinst && reg->encoding() >= 4 ) {
        prefix(REX);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}

// As above, but with REX.W set for 64-bit operand size (always emits a REX).
void Assembler::prefixq(Address adr, Register src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}

// XMM-register + memory operand: same R/X/B logic as the Register form,
// but xmm8-xmm15 never need the byte-instruction special case.
void Assembler::prefix(Address adr, XMMRegister reg) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}

// XMM + memory with REX.W (64-bit element moves); always emits a REX.
void Assembler::prefixq(Address adr, XMMRegister src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}
4191 
// adcq dst, imm32: add with carry, immediate form (REX.W 0x81 /2).
void Assembler::adcq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD0, dst, imm32);
}

// adcq dst, [src]: add with carry from memory (REX.W 0x13 /r).
void Assembler::adcq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}
4203 
4204 void Assembler::adcq(Register dst, Register src) {
4205   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4206   emit_arith(0x13, 0xC0, dst, src);
4207 }
4208 
// addq [dst], imm32 (REX.W 0x81 /0; emit_arith_operand may shrink to 0x83).
void Assembler::addq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rax, dst,imm32);
}

// addq [dst], src (REX.W 0x01 /r).
void Assembler::addq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}

// addq dst, imm32 (REX.W 0x81 /0).
void Assembler::addq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC0, dst, imm32);
}

// addq dst, [src] (REX.W 0x03 /r).
void Assembler::addq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}

// addq dst, src, register-register form (REX.W 0x03 /r).
void Assembler::addq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
4238 
// andq [dst], imm32: always the full 4-byte immediate form
// (REX.W 0x81 /4; rsp's encoding 4 supplies the /4 reg field).
void Assembler::andq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);
  emit_long(imm32);
}

// andq dst, imm32 (REX.W 0x81 /4).
void Assembler::andq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE0, dst, imm32);
}

// andq dst, [src] (REX.W 0x23 /r).
void Assembler::andq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}
4258 
4259 void Assembler::andq(Register dst, Register src) {
4260   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4261   emit_arith(0x23, 0xC0, dst, src);
4262 }
4263 
// bsfq: bit scan forward, 64-bit (REX.W 0F BC /r).
void Assembler::bsfq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

// bsrq: bit scan reverse, 64-bit (REX.W 0F BD /r).  Must not be used when
// LZCNT is supported: an F3 prefix would turn this encoding into LZCNT.
void Assembler::bsrq(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// bswapq: byte-swap a 64-bit register (REX.W 0F C8+rd).
void Assembler::bswapq(Register reg) {
  int encode = prefixq_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}
4284 
// cqo: sign-extend rax into rdx:rax (REX.W 0x99).
void Assembler::cdqq() {
  prefix(REX_W);
  emit_byte(0x99);
}

// clflush [adr]: flush cache line (0F AE /7; rdi's encoding 7 is the /7).
void Assembler::clflush(Address adr) {
  prefix(adr);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(rdi, adr);
}
4296 
// cmovcc dst, src: 64-bit conditional move, register form
// (REX.W 0F 40+cc /r -- condition folded into the opcode byte).
void Assembler::cmovq(Condition cc, Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}

// cmovcc dst, [src]: 64-bit conditional move from memory.
void Assembler::cmovq(Condition cc, Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
4311 
// cmpq [dst], imm32: always the 4-byte immediate form
// (REX.W 0x81 /7; rdi's encoding 7 supplies the /7 reg field).
void Assembler::cmpq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}

// cmpq dst, imm32 (REX.W 0x81 /7 via emit_arith).
void Assembler::cmpq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xF8, dst, imm32);
}
4324 
void Assembler::cmpq(Address dst, Register src) {
  // NOTE(review): this emits opcode 0x3B (CMP r64, r/m64), so the flags
  // reflect src - [dst] -- the operand order is reversed relative to the
  // parameter names (0x39 would encode CMP [dst], src).  ZF is the same
  // either way, so equality tests behave as named, but signed/unsigned
  // ordering conditions are inverted; confirm callers before relying on
  // ordered conditions here.
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x3B);
  emit_operand(src, dst);
}
4331 
// cmpq dst, src: register-register compare (REX.W 0x3B /r).
void Assembler::cmpq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}

// cmpq dst, [src]: compare register with memory (REX.W 0x3B /r).
void Assembler::cmpq(Register dst, Address  src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}
4343 
// cmpxchgq: compare-and-exchange rax with [adr], 64-bit (REX.W 0F B1 /r).
// Note: no lock prefix is emitted here; callers add it when needed.
void Assembler::cmpxchgq(Register reg, Address adr) {
  InstructionMark im(this);
  prefixq(adr, reg);
  emit_byte(0x0F);
  emit_byte(0xB1);
  emit_operand(reg, adr);
}
4351 
// cvtsi2sd dst, src: convert a 64-bit integer register to double
// (F2-prefixed 0x2A, REX.W via the _q simd prefix helper).
void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

// cvtsi2sd dst, [src]: convert a 64-bit integer in memory to double.
void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// cvtsi2ss dst, src: convert a 64-bit integer register to float (F3 0x2A).
void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

// cvtsi2ss dst, [src]: convert a 64-bit integer in memory to float.
void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// cvttsd2si dst, src: truncating double -> 64-bit integer (F2 0x2C).
void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

// cvttss2si dst, src: truncating float -> 64-bit integer (F3 0x2C).
void Assembler::cvttss2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
4395 
// decl: 32-bit decrement of a register (0xFF /1).
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}

// decq: 64-bit decrement of a register (REX.W 0xFF /1).
void Assembler::decq(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}

// decq [dst]: 64-bit memory decrement (REX.W 0xFF /1; rcx encodes the /1).
void Assembler::decq(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}
4419 
// fxrstor [src]: restore x87/SSE state (0F AE /1; as_Register(1) is the /1).
// Note the REX.W prefix selects the 64-bit form of the save image.
void Assembler::fxrstor(Address src) {
  prefixq(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(1), src);
}

// fxsave [dst]: save x87/SSE state (0F AE /0).
void Assembler::fxsave(Address dst) {
  prefixq(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(0), dst);
}
4433 
// idivq src: signed 128/64-bit divide of rdx:rax by src (REX.W 0xF7 /7).
void Assembler::idivq(Register src) {
  int encode = prefixq_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}

// imulq dst, src: two-operand signed multiply (REX.W 0F AF /r).
void Assembler::imulq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}

// imulq dst, src, value: three-operand signed multiply by an immediate,
// using the short 8-bit form (0x6B) when the value fits, else 0x69.
void Assembler::imulq(Register dst, Register src, int value) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}
4459 
// incl: 32-bit increment of a register (0xFF /0).
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}

// incq: 64-bit increment of a register (REX.W 0xFF /0).
void Assembler::incq(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}

// incq [dst]: 64-bit memory increment (REX.W 0xFF /0; rax encodes the /0).
void Assembler::incq(Address dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}
4483 
// lea: pointer-width load-effective-address; on 64-bit this is leaq.
void Assembler::lea(Register dst, Address src) {
  leaq(dst, src);
}

// leaq dst, [src] (REX.W 0x8D /r).
void Assembler::leaq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8D);
  emit_operand(dst, src);
}
4494 
// mov64 dst, imm64: load a full 64-bit immediate (REX.W B8+rd imm64).
void Assembler::mov64(Register dst, int64_t imm64) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_long64(imm64);
}

// Same encoding as mov64, but the 64-bit literal carries relocation info.
void Assembler::mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data64(imm64, rspec);
}
4508 
// Load a 32-bit compressed (narrow) oop immediate into a register
// (B8+rd imm32); the immediate is relocated as narrow_oop_operand.
void Assembler::mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// Store a 32-bit narrow-oop immediate to memory (0xC7 /0 imm32).
void Assembler::mov_narrow_oop(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// Compare a register with a narrow-oop immediate (0x81 /7 imm32).
void Assembler::cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(src1->encoding());
  emit_byte(0x81);
  emit_byte(0xF8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// Compare memory with a narrow-oop immediate (0x81 /7 imm32).
void Assembler::cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(src1);
  emit_byte(0x81);
  emit_operand(rax, src1, 4);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4539 
// lzcntq: leading-zero count, 64-bit (F3 REX.W 0F BD /r).  The F3 prefix
// must precede the REX prefix; without LZCNT support this would decode as BSR.
void Assembler::lzcntq(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
4548 
// movq xmm, r64: move a 64-bit GPR into an XMM register (66 REX.W 0F 6E).
void Assembler::movdq(XMMRegister dst, Register src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}

// movq r64, xmm: move an XMM register into a 64-bit GPR (66 REX.W 0F 7E).
void Assembler::movdq(Register dst, XMMRegister src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}
4565 
// movq dst, src: 64-bit register-register move (REX.W 0x8B /r).
void Assembler::movq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}

// movq dst, [src]: 64-bit load (REX.W 0x8B /r).
void Assembler::movq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

// movq [dst], src: 64-bit store (REX.W 0x89 /r).
void Assembler::movq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
4585 
// movsbq dst, [src]: sign-extend a byte from memory to 64 bits (REX.W 0F BE).
void Assembler::movsbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}

// movsbq dst, src: sign-extend the low byte of a register to 64 bits.
void Assembler::movsbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);
}
4600 
// Intended: sign-extended 32-bit immediate move into a 64-bit register.
// Known broken and guarded by ShouldNotReachHere() -- note that
// 'emit_byte(0xC7 | encode)' ORs the register into the opcode byte and
// emits no ModRM byte, which matches the bad encodings dbx reported below.
void Assembler::movslq(Register dst, int32_t imm32) {
  // dbx shows movslq(rcx, 3) as movq     $0x0000000049000000,(%rbx)
  // and movslq(r8, 3); as movl     $0x0000000048000000,(%rbx)
  // as a result we shouldn't use until tested at runtime...
  ShouldNotReachHere();
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xC7 | encode);
  emit_long(imm32);
}
4611 
// movq [dst], imm32: store a sign-extended 32-bit immediate to a 64-bit
// memory slot (REX.W 0xC7 /0 imm32).
void Assembler::movslq(Address dst, int32_t imm32) {
  assert(is_simm32(imm32), "lost bits");
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_long(imm32);
}

// movslq dst, [src]: load and sign-extend a 32-bit value (REX.W 0x63 /r).
void Assembler::movslq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x63);
  emit_operand(dst, src);
}

// movslq dst, src: sign-extend the low 32 bits of a register.
void Assembler::movslq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x63);
  emit_byte(0xC0 | encode);
}
4633 
// movswq dst, [src]: sign-extend a 16-bit value from memory (REX.W 0F BF).
void Assembler::movswq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}

// movswq dst, src: sign-extend the low 16 bits of a register.
void Assembler::movswq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}

// movzbq dst, [src]: zero-extend a byte from memory (REX.W 0F B6).
void Assembler::movzbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}

// movzbq dst, src: zero-extend the low byte of a register.
void Assembler::movzbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}

// movzwq dst, [src]: zero-extend a 16-bit value from memory (REX.W 0F B7).
void Assembler::movzwq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}

// movzwq dst, src: zero-extend the low 16 bits of a register.
void Assembler::movzwq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}
4678 
// negq dst: two's-complement negate, 64-bit (REX.W 0xF7 /3).
void Assembler::negq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD8 | encode);
}

// notq dst: one's-complement (bitwise NOT), 64-bit (REX.W 0xF7 /2).
void Assembler::notq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD0 | encode);
}
4690 
// orq [dst], imm32: always the 4-byte immediate form
// (REX.W 0x81 /1; rcx's encoding 1 supplies the /1 reg field).
void Assembler::orq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rcx, dst, 4);
  emit_long(imm32);
}

// orq dst, imm32 (REX.W 0x81 /1 via emit_arith).
void Assembler::orq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC8, dst, imm32);
}

// orq dst, [src] (REX.W 0x0B /r).
void Assembler::orq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}

// orq dst, src, register-register form (REX.W 0x0B /r).
void Assembler::orq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}
4715 
// Restore all 16 GPRs saved by pusha() below and pop the 16-slot frame.
// Slot layout mirrors pusha(): rax at the highest slot, r15 at the lowest;
// rsp's own slot (11) is skipped.
void Assembler::popa() { // 64bit
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9,  Address(rsp, 6 * wordSize));
  movq(r8,  Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  // skip rsp
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));

  addq(rsp, 16 * wordSize);
}
4736 
// popcntq dst, [src]: population count of a 64-bit memory operand
// (F3 REX.W 0F B8 /r; the mandatory F3 prefix precedes the REX prefix).
void Assembler::popcntq(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}

// popcntq dst, src: register form of the above.
void Assembler::popcntq(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}
4755 
// popq [dst]: pop the stack top into memory (0x8F /0; rax encodes the /0).
void Assembler::popq(Address dst) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst);
}
4762 
// Save all 16 GPRs into a 16-slot frame (inverse of popa() above).
// The original rsp is written first, into what becomes slot 11 after the
// subq, so reads of the saved rsp see the pre-pusha value.
void Assembler::pusha() { // 64bit
  // we have to store original rsp.  ABI says that 128 bytes
  // below rsp are local scratch.
  movq(Address(rsp, -5 * wordSize), rsp);

  subq(rsp, 16 * wordSize);

  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  // skip rsp
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
4787 
// pushq [src]: push a 64-bit memory operand (0xFF /6; rsi encodes the /6).
void Assembler::pushq(Address src) {
  InstructionMark im(this);
  prefixq(src);
  emit_byte(0xFF);
  emit_operand(rsi, src);
}
4794 
// rclq dst, imm8: rotate through carry left, 64-bit.  Uses the short
// shift-by-1 form (0xD1 /2) when possible, else 0xC1 /2 with an imm8.
void Assembler::rclq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");  // 64-bit counts: imm8 < 128
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}
// sarq dst, imm8: arithmetic shift right, 64-bit (0xD1 /7 or 0xC1 /7 imm8).
void Assembler::sarq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xF8 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xF8 | encode);
    emit_byte(imm8);
  }
}

// sarq dst: arithmetic shift right by CL (0xD3 /7).
void Assembler::sarq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}
4825 
// sbbq [dst], imm32: subtract with borrow, memory-immediate form
// (REX.W 0x81 /3; rbx's encoding 3 supplies the /3 reg field).
void Assembler::sbbq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbx, dst, imm32);
}

// sbbq dst, imm32 (REX.W 0x81 /3 via emit_arith).
void Assembler::sbbq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD8, dst, imm32);
}

// sbbq dst, [src] (REX.W 0x1B /r).
void Assembler::sbbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}

// sbbq dst, src, register-register form (REX.W 0x1B /r).
void Assembler::sbbq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}
4848 
// shlq dst, imm8: shift left, 64-bit; short form (0xD1 /4) for count 1,
// else 0xC1 /4 with an imm8.
void Assembler::shlq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}

// shlq dst: shift left by CL (0xD3 /4).
void Assembler::shlq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}

// shrq dst, imm8: logical shift right (0xC1 /5 imm8; no short-form here).
void Assembler::shrq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xC1);
  emit_byte(0xE8 | encode);
  emit_byte(imm8);
}

// shrq dst: logical shift right by CL (0xD3 /5).
void Assembler::shrq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}
4881 
// subq [dst], imm32 (REX.W 0x81 /5; rbp's encoding 5 supplies the /5).
void Assembler::subq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);
}

// subq [dst], src (REX.W 0x29 /r).
void Assembler::subq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}

// subq dst, imm32 (REX.W 0x81 /5; emit_arith may use the short 8-bit form).
void Assembler::subq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE8, dst, imm32);
}

// Force generation of a 4 byte immediate value even if it fits into 8bit
void Assembler::subq_imm32(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}

// subq dst, [src] (REX.W 0x2B /r).
void Assembler::subq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}

// subq dst, src, register-register form (REX.W 0x2B /r).
void Assembler::subq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
4917 
// testq dst, imm32: AND-and-set-flags without storing the result.
// Uses the short rax-specific form (0xA9) when dst is rax, else 0xF7 /0.
void Assembler::testq(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    prefix(REX_W);
    emit_byte(0xA9);
  } else {
    encode = prefixq_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}

// testq dst, src: register-register form (REX.W 0x85 /r).
void Assembler::testq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}
4938 
// XADD qword [dst], src: REX.W + 0F C1 /r. Exchange-and-add;
// callers emit lock() first when atomicity is required.
void Assembler::xaddq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}
4946 
// XCHG dst, qword [src]: REX.W + 87 /r. With a memory operand the
// processor asserts the LOCK signal implicitly.
void Assembler::xchgq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}
4953 
// XCHG dst, src (64-bit, register form): REX.W + 87, ModRM.
// (Does not use the 90+r short form reserved for rax.)
void Assembler::xchgq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xc0 | encode);
}
4959 
// XOR dst, src (64-bit): REX.W + 33 /r.
void Assembler::xorq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}
4964 
// XOR dst, qword [src]: REX.W + 33 /r.
void Assembler::xorq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}
4971 
4972 #endif // !LP64
4973 
// Maps each x86 condition code to its logical negation, indexed by the
// Assembler::Condition value shown in each entry's comment. (Hardware
// encodes the negation by flipping the low bit of the condition; the
// explicit table keeps that relationship self-documenting.)
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
4993 
4994 
4995 // Implementation of MacroAssembler
4996 
4997 // First all the versions that have distinct versions depending on 32/64 bit
4998 // Unless the difference is trivial (1 line or so).
4999 
5000 #ifndef _LP64
5001 
5002 // 32bit versions
5003 
// 32-bit: every literal is directly addressable, so just wrap the
// absolute target plus its relocation spec in an Address.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}
5007 
// 32-bit: an array element can be addressed absolutely as
// [base + index*scale + disp] in a single instruction.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}
5011 
// 32-bit biased-locking fast path. Tries to take (or keep) the bias on
// obj_reg for the current thread. swap_reg must be rax (cmpxchg
// requirement). If tmp_reg is noreg, lock_reg is borrowed (saved/restored
// around each use). On success jumps to 'done'; on contention optionally
// jumps to *slow_case; otherwise falls through to 'cas_label' for the
// normal CAS-based locking. Returns the code offset usable for an
// implicit null check (-1 if swap_reg already contained the mark and no
// mark load was emitted here).
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    // No scratch register supplied: reuse lock_reg, preserving its value
    // with push/pop around each use.
    need_tmp_reg = true;
    tmp_reg = lock_reg;
  } else {
    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  }
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movl(swap_reg, mark_addr);
  }
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, swap_reg);
  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  // Note that because there is no current thread register on x86 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movl(saved_mark_addr, swap_reg);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  xorl(swap_reg, tmp_reg);
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  movl(tmp_reg, klass_addr);
  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  // Ignore the age bits: they may legitimately differ.
  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
  jcc(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testl(swap_reg, markOopDesc::epoch_mask_in_place);
  jcc(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  movl(swap_reg, saved_mark_addr);
  andl(swap_reg,
       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  orl(tmp_reg, swap_reg);
  if (os::is_MP()) {
    lock();
  }
  // cmpxchg: compares rax (swap_reg, presumed unbiased header) with the
  // mark word and installs the thread-biased header on success.
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  movl(swap_reg, klass_addr);
  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
  movl(swap_reg, saved_mark_addr);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  movl(swap_reg, saved_mark_addr);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, klass_addr);
  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
// Call a C leaf routine whose arguments have already been pushed.
// 32-bit cdecl: the caller removes its own arguments afterwards.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize); // pop the pushed arguments
}
5228 
// Compare [src1] with an embedded oop constant; the immediate carries an
// oop relocation so the GC can update it if the object moves.
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5232 
// Compare src1 with an embedded oop constant (relocated, see above overload).
void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5236 
// Sign-extend the 32-bit value in 'lo' into the 64-bit pair hi:lo.
void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  // CDQ only works on the fixed rdx:rax pair but is faster on P6-class CPUs.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31); // replicate the sign bit across hi
  }
}
5246 
// Jump to L if FPU condition flag C2 is set. C2 is moved into the CPU's
// parity flag via FNSTSW AX / SAHF; tmp preserves rax across the sequence.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}
5256 
// Jump to L if FPU condition flag C2 is clear (inverse of jC2 above).
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}
5266 
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry)); // indirect jump through the absolute table slot
}
5272 
// Note: y_lo will be destroyed
// Three-way compare of two Java longs (x <=> y): leaves -1, 0 or +1 in
// x_hi, per JVM lcmp semantics. High words compare signed, low words
// unsigned.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);           // high words equal: unsigned compare of low words decides
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done); // equal: result 0 already in x_hi

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);            // result +1
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);           // result -1

  bind(done);
}
5298 
// 32-bit: load the literal's absolute address into dst, keeping the
// relocation so patching/GC can fix the embedded pointer.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}
5302 
// Store the literal's absolute address to memory at dst (relocated).
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}
5308 
// Tear down the current frame (equivalent of the LEAVE instruction).
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
5313 
// 64x64->64 bit multiply of two Java longs on the stack; clobbers
// rax, rbx, rcx and rdx. Result is delivered in rdx:rax.
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
5353 
// Two's-complement negation of the 64-bit value in the hi:lo pair.
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);     // lo = -lo; CF = (lo != 0), i.e. borrow into the high word
  adcl(hi, 0);  // propagate the borrow
  negl(hi);     // hi = -(hi + borrow)
}
5359 
void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
5379 
5380 
// Long right shift; sign_extension selects arithmetic (>>) vs logical (>>>).
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
5401 
// Load an embedded oop constant into dst (oop-relocated immediate).
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5405 
// Store an embedded oop constant to memory (oop-relocated immediate).
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5409 
// Load a pointer-sized value: an lval literal loads the address itself
// (relocated), otherwise load the value stored at the literal's address.
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}
5417 
// Store src into an array slot (32-bit: single absolute-addressed move).
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}
5421 
// Load from an array slot into dst (32-bit: single absolute-addressed move).
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}
5425 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (real pointers need relocation; plain intptr_t values do not).
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
5430 
5431 
// Restore the registers saved by push_callee_saved_registers()
// (popped in the reverse of the push order).
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}
5438 
// Reload the FPU top-of-stack (a double) from the expression stack and
// free the two words it occupied.
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}
5443 
// Save the registers this code treats as callee-saved; matches the pop
// order in pop_callee_saved_registers().
void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}
5450 
// Spill the FPU top-of-stack (a double) to two words on the expression stack.
void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}
5455 
5456 
// Push an embedded oop constant (oop-relocated immediate).
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}
5460 
5461 
// Push a pointer-sized value: an lval literal pushes the address itself
// (relocated), otherwise push the value stored at the literal's address.
void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}
5469 
// dst := (ZF clear) ? 1 : 0, as a full word. The xor clears the upper
// bytes since setcc only writes the low byte; note xorl preserves ZF.
void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}
5474 
// 32-bit calling convention helper: argument 0 is passed on the stack.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5478 
// 32-bit calling convention helper: argument 1 is passed on the stack.
static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5482 
// 32-bit calling convention helper: argument 2 is passed on the stack.
static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5486 
// 32-bit calling convention helper: argument 3 is passed on the stack.
static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5490 
5491 #ifndef PRODUCT
5492 extern "C" void findpc(intptr_t x);
5493 #endif
5494 
5495 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
5496   // In order to get locks to work, we need to fake a in_VM state
5497   JavaThread* thread = JavaThread::current();
5498   JavaThreadState saved_state = thread->thread_state();
5499   thread->set_thread_state(_thread_in_vm);
5500   if (ShowMessageBoxOnError) {
5501     JavaThread* thread = JavaThread::current();
5502     JavaThreadState saved_state = thread->thread_state();
5503     thread->set_thread_state(_thread_in_vm);
5504     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
5505       ttyLocker ttyl;
5506       BytecodeCounter::print();
5507     }
5508     // To see where a verify_oop failed, get $ebx+40/X for this frame.
5509     // This is the value of eip which points to where verify_oop will return.
5510     if (os::message_box(msg, "Execution stopped, print registers?")) {
5511       ttyLocker ttyl;
5512       tty->print_cr("eip = 0x%08x", eip);
5513 #ifndef PRODUCT
5514       if ((WizardMode || Verbose) && PrintMiscellaneous) {
5515         tty->cr();
5516         findpc(eip);
5517         tty->cr();
5518       }
5519 #endif
5520       tty->print_cr("rax = 0x%08x", rax);
5521       tty->print_cr("rbx = 0x%08x", rbx);
5522       tty->print_cr("rcx = 0x%08x", rcx);
5523       tty->print_cr("rdx = 0x%08x", rdx);
5524       tty->print_cr("rdi = 0x%08x", rdi);
5525       tty->print_cr("rsi = 0x%08x", rsi);
5526       tty->print_cr("rbp = 0x%08x", rbp);
5527       tty->print_cr("rsp = 0x%08x", rsp);
5528       BREAKPOINT;
5529       assert(false, "start up GDB");
5530     }
5531   } else {
5532     ttyLocker ttyl;
5533     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
5534     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
5535   }
5536   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
5537 }
5538 
// Emit code that halts execution with a diagnostic message: pushes the
// message and a fake return address (current eip), saves all registers,
// and calls debug32 with that state.
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                           // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt(); // should never return from debug32
}
5548 
// Emit code that prints a warning message and continues; the full CPU
// state is preserved around the call.
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
5560 
5561 #else // _LP64
5562 
5563 // 64 bit versions
5564 
// 64-bit: form an Address for a literal that is known to be reachable
// from the current code position (pc-relative disp32 fits).
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}
5574 
// 64-bit: array bases may not fit in a disp32, so materialize the base
// in rscratch1 (clobbered!) and return [rscratch1 + index*scale].
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
5583 
5584 int MacroAssembler::biased_locking_enter(Register lock_reg,
5585                                          Register obj_reg,
5586                                          Register swap_reg,
5587                                          Register tmp_reg,
5588                                          bool swap_reg_contains_mark,
5589                                          Label& done,
5590                                          Label* slow_case,
5591                                          BiasedLockingCounters* counters) {
5592   assert(UseBiasedLocking, "why call this otherwise?");
5593   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
5594   assert(tmp_reg != noreg, "tmp_reg must be supplied");
5595   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
5596   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
5597   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
5598   Address saved_mark_addr(lock_reg, 0);
5599 
5600   if (PrintBiasedLockingStatistics && counters == NULL)
5601     counters = BiasedLocking::counters();
5602 
5603   // Biased locking
5604   // See whether the lock is currently biased toward our thread and
5605   // whether the epoch is still valid
5606   // Note that the runtime guarantees sufficient alignment of JavaThread
5607   // pointers to allow age to be placed into low bits
5608   // First check to see whether biasing is even enabled for this object
5609   Label cas_label;
5610   int null_check_offset = -1;
5611   if (!swap_reg_contains_mark) {
5612     null_check_offset = offset();
5613     movq(swap_reg, mark_addr);
5614   }
5615   movq(tmp_reg, swap_reg);
5616   andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5617   cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
5618   jcc(Assembler::notEqual, cas_label);
5619   // The bias pattern is present in the object's header. Need to check
5620   // whether the bias owner and the epoch are both still current.
5621   load_prototype_header(tmp_reg, obj_reg);
5622   orq(tmp_reg, r15_thread);
5623   xorq(tmp_reg, swap_reg);
5624   andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
5625   if (counters != NULL) {
5626     cond_inc32(Assembler::zero,
5627                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5628   }
5629   jcc(Assembler::equal, done);
5630 
5631   Label try_revoke_bias;
5632   Label try_rebias;
5633 
5634   // At this point we know that the header has the bias pattern and
5635   // that we are not the bias owner in the current epoch. We need to
5636   // figure out more details about the state of the header in order to
5637   // know what operations can be legally performed on the object's
5638   // header.
5639 
5640   // If the low three bits in the xor result aren't clear, that means
5641   // the prototype header is no longer biased and we have to revoke
5642   // the bias on this object.
5643   testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5644   jcc(Assembler::notZero, try_revoke_bias);
5645 
5646   // Biasing is still enabled for this data type. See whether the
5647   // epoch of the current bias is still valid, meaning that the epoch
5648   // bits of the mark word are equal to the epoch bits of the
5649   // prototype header. (Note that the prototype header's epoch bits
5650   // only change at a safepoint.) If not, attempt to rebias the object
5651   // toward the current thread. Note that we must be absolutely sure
5652   // that the current epoch is invalid in order to do this because
5653   // otherwise the manipulations it performs on the mark word are
5654   // illegal.
5655   testq(tmp_reg, markOopDesc::epoch_mask_in_place);
5656   jcc(Assembler::notZero, try_rebias);
5657 
5658   // The epoch of the current bias is still valid but we know nothing
5659   // about the owner; it might be set or it might be clear. Try to
5660   // acquire the bias of the object using an atomic operation. If this
5661   // fails we will go in to the runtime to revoke the object's bias.
5662   // Note that we first construct the presumed unbiased header so we
5663   // don't accidentally blow away another thread's valid bias.
5664   andq(swap_reg,
5665        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
5666   movq(tmp_reg, swap_reg);
5667   orq(tmp_reg, r15_thread);
5668   if (os::is_MP()) {
5669     lock();
5670   }
5671   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5672   // If the biasing toward our thread failed, this means that
5673   // another thread succeeded in biasing it toward itself and we
5674   // need to revoke that bias. The revocation will occur in the
5675   // interpreter runtime in the slow case.
5676   if (counters != NULL) {
5677     cond_inc32(Assembler::zero,
5678                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5679   }
5680   if (slow_case != NULL) {
5681     jcc(Assembler::notZero, *slow_case);
5682   }
5683   jmp(done);
5684 
5685   bind(try_rebias);
5686   // At this point we know the epoch has expired, meaning that the
5687   // current "bias owner", if any, is actually invalid. Under these
5688   // circumstances _only_, we are allowed to use the current header's
5689   // value as the comparison value when doing the cas to acquire the
5690   // bias in the current epoch. In other words, we allow transfer of
5691   // the bias from one thread to another directly in this situation.
5692   //
5693   // FIXME: due to a lack of registers we currently blow away the age
5694   // bits in this situation. Should attempt to preserve them.
5695   load_prototype_header(tmp_reg, obj_reg);
5696   orq(tmp_reg, r15_thread);
5697   if (os::is_MP()) {
5698     lock();
5699   }
5700   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5701   // If the biasing toward our thread failed, then another thread
5702   // succeeded in biasing it toward itself and we need to revoke that
5703   // bias. The revocation will occur in the runtime in the slow case.
5704   if (counters != NULL) {
5705     cond_inc32(Assembler::zero,
5706                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
5707   }
5708   if (slow_case != NULL) {
5709     jcc(Assembler::notZero, *slow_case);
5710   }
5711   jmp(done);
5712 
5713   bind(try_revoke_bias);
5714   // The prototype mark in the klass doesn't have the bias bit set any
5715   // more, indicating that objects of this data type are not supposed
5716   // to be biased any more. We are going to try to reset the mark of
5717   // this object to the prototype value and fall through to the
5718   // CAS-based locking scheme. Note that if our CAS fails, it means
5719   // that another thread raced us for the privilege of revoking the
5720   // bias of this particular object, so it's okay to continue in the
5721   // normal locking code.
5722   //
5723   // FIXME: due to a lack of registers we currently blow away the age
5724   // bits in this situation. Should attempt to preserve them.
5725   load_prototype_header(tmp_reg, obj_reg);
5726   if (os::is_MP()) {
5727     lock();
5728   }
5729   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5730   // Fall through to the normal CAS-based lock, because no matter what
5731   // the result of the above CAS, some thread must have succeeded in
5732   // removing the bias bit from the object's header.
5733   if (counters != NULL) {
5734     cond_inc32(Assembler::zero,
5735                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
5736   }
5737 
5738   bind(cas_label);
5739 
5740   return null_check_offset;
5741 }
5742 
// 64-bit leaf call: calls entry_point directly with no thread-state
// transition and no last_Java_frame bookkeeping.  Arguments must already
// sit in the C calling-convention registers; num_args is only consulted
// on Windows to check the register-argument limit.  Dynamically aligns
// rsp to 16 bytes around the call as required by the ABI.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for it's register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  // Misaligned path: push 8 bytes so the call below sees a 16-byte
  // aligned stack, then undo the adjustment.
  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  // Already-aligned path.
  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
5776 
5777 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
5778   assert(!src2.is_lval(), "should use cmpptr");
5779 
5780   if (reachable(src2)) {
5781     cmpq(src1, as_Address(src2));
5782   } else {
5783     lea(rscratch1, src2);
5784     Assembler::cmpq(src1, Address(rscratch1, 0));
5785   }
5786 }
5787 
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case: min_long idiv -1 overflows in hardware, so
  // that combination is filtered out here and the architected Java
  // result (quotient = min_long, remainder = 0) is produced instead.
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq(); // sign-extend rax into rdx:rax for the 128/64-bit divide
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
5824 
5825 void MacroAssembler::decrementq(Register reg, int value) {
5826   if (value == min_jint) { subq(reg, value); return; }
5827   if (value <  0) { incrementq(reg, -value); return; }
5828   if (value == 0) {                        ; return; }
5829   if (value == 1 && UseIncDec) { decq(reg) ; return; }
5830   /* else */      { subq(reg, value)       ; return; }
5831 }
5832 
5833 void MacroAssembler::decrementq(Address dst, int value) {
5834   if (value == min_jint) { subq(dst, value); return; }
5835   if (value <  0) { incrementq(dst, -value); return; }
5836   if (value == 0) {                        ; return; }
5837   if (value == 1 && UseIncDec) { decq(dst) ; return; }
5838   /* else */      { subq(dst, value)       ; return; }
5839 }
5840 
5841 void MacroAssembler::incrementq(Register reg, int value) {
5842   if (value == min_jint) { addq(reg, value); return; }
5843   if (value <  0) { decrementq(reg, -value); return; }
5844   if (value == 0) {                        ; return; }
5845   if (value == 1 && UseIncDec) { incq(reg) ; return; }
5846   /* else */      { addq(reg, value)       ; return; }
5847 }
5848 
5849 void MacroAssembler::incrementq(Address dst, int value) {
5850   if (value == min_jint) { addq(dst, value); return; }
5851   if (value <  0) { decrementq(dst, -value); return; }
5852   if (value == 0) {                        ; return; }
5853   if (value == 1 && UseIncDec) { incq(dst) ; return; }
5854   /* else */      { addq(dst, value)       ; return; }
5855 }
5856 
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  // Materialize the table base in rscratch1, graft it onto the
  // (base-less) index Address, then jump indirectly through the slot.
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}
5866 
// 32-bit-only helper (long compare via register pairs); must never be
// reached in the 64-bit build.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
5871 
// Load the literal address (with its relocation info) into dst.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}
5875 
// Store the literal address into memory at dst, staged through rscratch1.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}
5880 
// Emit a single-byte LEAVE (mov rsp,rbp; pop rbp) to tear down the frame.
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_byte(0xC9); // LEAVE
}
5885 
// 32-bit-only helper (long negate via register pair); must never be
// reached in the 64-bit build.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
5890 
// Load an oop constant into dst, recording an oop relocation so the
// embedded pointer can be found and updated later.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}
5894 
// Store an oop constant to memory at dst, staged through rscratch1,
// recording an oop relocation for the embedded pointer.
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}
5899 
5900 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
5901   if (src.is_lval()) {
5902     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
5903   } else {
5904     if (reachable(src)) {
5905       movq(dst, as_Address(src));
5906     } else {
5907       lea(rscratch1, src);
5908       movq(dst, Address(rscratch1,0));
5909     }
5910   }
5911 }
5912 
// Store src into the array slot described by dst.
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}
5916 
// Load the array slot described by src into dst.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}
5920 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (a raw pointer here would be embedded without relocation info).
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}
5926 
// These are mostly for initializing NULL
// (sign-extending 32-bit store avoids the rscratch1 staging above).
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}
5931 
// Load a (sign-extended) 32-bit constant into a 64-bit register.
void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
5935 
// Push an oop constant onto the stack, staged through rscratch1 so the
// oop relocation from movoop is preserved.
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}
5940 
// Push either the literal address itself (lval) or the pointer-sized
// value stored at it.  rscratch1 always receives the address first.
void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}
5949 
// Clear the thread's last_Java_frame anchor (sp always; fp/pc optionally)
// so the frame walker no longer treats this frame as a Java frame.
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}
5964 
// Record the last Java frame in the thread anchor.  fp and pc are
// optional; sp defaults to the current rsp.  The sp store comes last:
// a valid sp is what marks the anchor as set.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
5989 
5990 static void pass_arg0(MacroAssembler* masm, Register arg) {
5991   if (c_rarg0 != arg ) {
5992     masm->mov(c_rarg0, arg);
5993   }
5994 }
5995 
5996 static void pass_arg1(MacroAssembler* masm, Register arg) {
5997   if (c_rarg1 != arg ) {
5998     masm->mov(c_rarg1, arg);
5999   }
6000 }
6001 
6002 static void pass_arg2(MacroAssembler* masm, Register arg) {
6003   if (c_rarg2 != arg ) {
6004     masm->mov(c_rarg2, arg);
6005   }
6006 }
6007 
6008 static void pass_arg3(MacroAssembler* masm, Register arg) {
6009   if (c_rarg3 != arg ) {
6010     masm->mov(c_rarg3, arg);
6011   }
6012 }
6013 
// Emit a fatal-stop sequence: saves all registers, then calls
// debug64(msg, faulting rip, pointer to the saved register block).
// Never returns -- the emitted code ends in hlt.
void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
6024 
// Emit code that prints a warning and continues execution: the original
// rsp is saved on the stack, the full CPU state is preserved around a
// leaf call to warning(msg), then everything is restored.
void MacroAssembler::warn(const char* msg) {
  push(rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call

  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  pop(rsp);           // restore the pre-alignment rsp saved above
}
6035 
6036 #ifndef PRODUCT
6037 extern "C" void findpc(intptr_t x);
6038 #endif
6039 
// Runtime (C++) backend for stop(): msg is the stop text, pc the address
// captured at the stop site, regs a pointer to the register block saved
// by pusha there.  rax was pushed first, so it sits at the highest index
// (regs[15]) and r15 at the lowest (regs[0]) -- see the prints below.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr("rax = 0x%016lx", regs[15]);
      tty->print_cr("rbx = 0x%016lx", regs[12]);
      tty->print_cr("rcx = 0x%016lx", regs[14]);
      tty->print_cr("rdx = 0x%016lx", regs[13]);
      tty->print_cr("rdi = 0x%016lx", regs[8]);
      tty->print_cr("rsi = 0x%016lx", regs[9]);
      tty->print_cr("rbp = 0x%016lx", regs[10]);
      tty->print_cr("rsp = 0x%016lx", regs[11]);
      tty->print_cr("r8  = 0x%016lx", regs[7]);
      tty->print_cr("r9  = 0x%016lx", regs[6]);
      tty->print_cr("r10 = 0x%016lx", regs[5]);
      tty->print_cr("r11 = 0x%016lx", regs[4]);
      tty->print_cr("r12 = 0x%016lx", regs[3]);
      tty->print_cr("r13 = 0x%016lx", regs[2]);
      tty->print_cr("r14 = 0x%016lx", regs[1]);
      tty->print_cr("r15 = 0x%016lx", regs[0]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    // No message box: just print and assert (fatal in debug builds).
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}
6089 
6090 #endif // _LP64
6091 
6092 // Now versions that are common to 32/64 bit
6093 
// Pointer-sized add of an immediate (addq on 64-bit, addl on 32-bit).
void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}
6097 
// Pointer-sized register-to-register add.
void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}
6101 
// Pointer-sized add of a register into memory.
void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}
6105 
6106 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
6107   if (reachable(src)) {
6108     Assembler::addsd(dst, as_Address(src));
6109   } else {
6110     lea(rscratch1, src);
6111     Assembler::addsd(dst, Address(rscratch1, 0));
6112   }
6113 }
6114 
6115 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
6116   if (reachable(src)) {
6117     addss(dst, as_Address(src));
6118   } else {
6119     lea(rscratch1, src);
6120     addss(dst, Address(rscratch1, 0));
6121   }
6122 }
6123 
6124 void MacroAssembler::align(int modulus) {
6125   if (offset() % modulus != 0) {
6126     nop(modulus - (offset() % modulus));
6127   }
6128 }
6129 
6130 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
6131   // Used in sign-masking with aligned address.
6132   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6133   if (reachable(src)) {
6134     Assembler::andpd(dst, as_Address(src));
6135   } else {
6136     lea(rscratch1, src);
6137     Assembler::andpd(dst, Address(rscratch1, 0));
6138   }
6139 }
6140 
6141 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
6142   // Used in sign-masking with aligned address.
6143   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6144   if (reachable(src)) {
6145     Assembler::andps(dst, as_Address(src));
6146   } else {
6147     lea(rscratch1, src);
6148     Assembler::andps(dst, Address(rscratch1, 0));
6149   }
6150 }
6151 
// Pointer-sized AND with an immediate (andq on 64-bit, andl on 32-bit).
void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}
6155 
// Atomically increment the 32-bit counter at counter_addr (lock prefix
// on MP systems); flags are saved/restored around the increment.
void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
  pushf();
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
  popf();
}
6163 
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // The -1 because we already subtracted 1 page.
  // Note: this C++ loop runs at code-generation time, emitting one
  // store per shadow page (straight-line code, no runtime loop).
  for (int i = 0; i< StackShadowPages-1; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
6186 
// Fast-path unlock for a biased object: if the mark word still carries
// the biased-lock pattern, unlocking is a no-op and we branch to done.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
6201 
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
6210 
// Wouldn't need if AddressLiteral version had new name
// (plain forward to the base assembler's label call).
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
6215 
// Indirect call through a register (forward to the base assembler).
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
6219 
6220 void MacroAssembler::call(AddressLiteral entry) {
6221   if (reachable(entry)) {
6222     Assembler::call_literal(entry.target(), entry.rspec());
6223   } else {
6224     lea(rscratch1, entry);
6225     Assembler::call(rscratch1);
6226   }
6227 }
6228 
6229 // Implementation of call_VM versions
6230 
// call_VM with no Java arguments.  The call(C)/jmp(E) trampoline pushes
// a return address on the stack that call_VM_helper uses to derive
// last_Java_sp/pc (see the comment in call_VM_helper).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
6244 
// call_VM with one Java argument, marshalled into c_rarg1 by pass_arg1.
// Uses the same call(C)/jmp(E) trampoline as the zero-arg variant.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
6260 
// call_VM with two Java arguments.  Arguments are marshalled last-first
// so that an earlier move cannot clobber a later source; the assert
// guards the one remaining clobber hazard.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
6281 
// call_VM with three Java arguments, marshalled last-first; the asserts
// guard against a pass_argN move clobbering a yet-unmarshalled source.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
6307 
// call_VM variant with an explicitly supplied last_java_sp; arguments
// must already be marshalled.  Delegates to (possibly overridden)
// call_VM_base.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
6316 
// One-argument version of the explicit-last_java_sp call_VM.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6325 
// Two-argument version; arguments marshalled last-first to avoid
// clobbering a source register (see assert).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6338 
// Three-argument version; arguments marshalled last-first to avoid
// clobbering a source register (see asserts).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6354 
// Like call_VM, but the explicit MacroAssembler:: qualification bypasses
// any virtual override of call_VM_base (hence "super").
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
6363 
// One-argument super_call_VM (bypasses call_VM_base overrides).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6372 
// Two-argument super_call_VM; arguments marshalled last-first
// (assert guards the clobber hazard).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6385 
// Three-argument super_call_VM; arguments marshalled last-first
// (asserts guard the clobber hazards).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6401 
// The workhorse behind every call_VM variant: passes the current thread
// as the implicit first C argument, sets the last_Java_frame anchor,
// performs the leaf call, clears the anchor, optionally forwards a
// pending exception, and finally fetches the oop result (if any) out of
// the thread and clears it there.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      stop("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true, false);

#ifndef CC_INTERP
   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);
#endif /* CC_INTERP */

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach
    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
    movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
    verify_oop(oop_result, "broken oop in call_VM_base");
  }
}
6502 
// Computes last_Java_sp from the return address the call_VM trampoline
// pushed, then forwards to call_VM_base.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // use to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
6525 
// Leaf call with pre-marshalled arguments (no thread/frame bookkeeping).
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
6529 
// One-argument leaf call; arg_0 is marshalled into c_rarg0.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
6534 
// Two-argument leaf call; arguments marshalled last-first so an earlier
// move cannot clobber a later source (assert guards the hazard).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
6542 
// Three-argument leaf call; arguments marshalled last-first
// (asserts guard the clobber hazards).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
6552 
// One-argument leaf call that pins MacroAssembler::call_VM_leaf_base,
// bypassing any virtual override (hence "super").
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
6557 
// Two-argument super leaf call; arguments marshalled last-first
// (assert guards the clobber hazard).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
6565 
// Three-argument super leaf call; arguments marshalled last-first
// (asserts guard the clobber hazards).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
6575 
// Four-argument super leaf call; arguments marshalled last-first
// (asserts guard the clobber hazards).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
6589 
// Intentionally empty on x86: no extra earlyret handling is emitted here.
// (Hook kept so the shared interface has a platform implementation.)
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}

// Intentionally empty on x86: no extra popframe handling is emitted here.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
6595 
// 32-bit compare of the memory word at a literal address against an
// immediate; falls back to materializing the address in rscratch1 when the
// literal is not directly reachable (LP64 rip-relative range).
void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
  if (reachable(src1)) {
    cmpl(as_Address(src1), imm);
  } else {
    lea(rscratch1, src1);
    cmpl(Address(rscratch1, 0), imm);
  }
}

// 32-bit compare of a register against the word at a literal address.
// Literal-value (lval) operands must use cmpptr instead — enforced below.
void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "use cmpptr");
  if (reachable(src2)) {
    cmpl(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    cmpl(src1, Address(rscratch1, 0));
  }
}

// Register vs. immediate 32-bit compare.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}

// Register vs. memory 32-bit compare.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
6622 
// Materialize a double compare as an int in dst:
//   -1 if opr1 < opr2, 0 if equal, +1 if opr1 > opr2.
// ucomisd raises PF for an unordered (NaN) compare; unordered_is_less
// selects whether NaN results collapse to -1 or +1. The branchy sequence
// seeds dst with the extreme value, then refines via the flag tests.
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
6644 
// Single-precision counterpart of cmpsd2int: dst = -1/0/+1 for the ucomiss
// result, with NaN mapped per unordered_is_less. Same flag protocol (PF =
// unordered).
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
6666 
6667 
// Byte compare of the memory at a literal address against an immediate,
// with the usual rscratch1 fallback for unreachable literals.
void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
  if (reachable(src1)) {
    cmpb(as_Address(src1), imm);
  } else {
    lea(rscratch1, src1);
    cmpb(Address(rscratch1, 0), imm);
  }
}
6676 
// Pointer-sized compare of a register against a literal. An lval literal
// is compared as an address constant (with its relocation); a non-lval
// literal is dereferenced and its value compared. On LP64 unreachable
// targets are reached through rscratch1; on 32-bit everything is in range.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}

// Memory vs. literal-address compare; only meaningful for lval literals
// since x86 has no memory-to-memory compare (asserted below).
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
6707 
// Atomic pointer compare-and-exchange at a literal address. cmpxchg
// implicitly compares against rax (callers must stage the expected value
// there). The lock prefix is emitted only on multiprocessor systems.
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
  if (reachable(adr)) {
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    lea(rscratch1, adr);
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, Address(rscratch1, 0));
  }
}

// Pointer-width cmpxchg: 64-bit on LP64, 32-bit otherwise.
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}
6724 
// Ordered double compare against a literal-addressed operand, with the
// rscratch1 fallback when the literal is out of direct reach.
void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::comisd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::comisd(dst, Address(rscratch1, 0));
  }
}

// Single-precision counterpart of comisd above.
void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::comiss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::comiss(dst, Address(rscratch1, 0));
  }
}
6742 
6743 
// Atomically increment the 32-bit counter at counter_addr iff condition
// cond currently holds: branch around the increment on the negated
// condition.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  atomic_incl(counter_addr);
  bind(L);
}
6751 
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  //
  // The min_int / -1 case would trap in hardware (idiv overflow), so it is
  // filtered out up front and its JVM-specified result produced directly.
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  // sign-extend rax into rdx:rax, as idivl requires
  cdql();
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
6787 
6788 
6789 
6790 void MacroAssembler::decrementl(Register reg, int value) {
6791   if (value == min_jint) {subl(reg, value) ; return; }
6792   if (value <  0) { incrementl(reg, -value); return; }
6793   if (value == 0) {                        ; return; }
6794   if (value == 1 && UseIncDec) { decl(reg) ; return; }
6795   /* else */      { subl(reg, value)       ; return; }
6796 }
6797 
6798 void MacroAssembler::decrementl(Address dst, int value) {
6799   if (value == min_jint) {subl(dst, value) ; return; }
6800   if (value <  0) { incrementl(dst, -value); return; }
6801   if (value == 0) {                        ; return; }
6802   if (value == 1 && UseIncDec) { decl(dst) ; return; }
6803   /* else */      { subl(dst, value)       ; return; }
6804 }
6805 
// Signed division of reg by 2^shift_value via an arithmetic right shift.
// A plain sar rounds toward -infinity; for negative dividends the bias
// (2^shift - 1) is added first so the result rounds toward zero, matching
// integer division semantics.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  // bias == 1 gets the shorter inc encoding
  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
6822 
// Double divide by a literal-addressed operand, with rscratch1 fallback
// for unreachable literals.
void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::divsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::divsd(dst, Address(rscratch1, 0));
  }
}

// Single-precision counterpart of divsd above.
void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::divss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::divss(dst, Address(rscratch1, 0));
  }
}
6840 
// !defined(COMPILER2) is because of stupid core builds
#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
// Discard the whole x87 register stack: emms clears it in one instruction
// when MMX is available; otherwise free all eight slots individually.
void MacroAssembler::empty_FPU_stack() {
  if (VM_Version::supports_mmx()) {
    emms();
  } else {
    for (int i = 8; i-- > 0; ) ffree(i);
  }
}
#endif // !LP64 || C1 || !C2
6851 
6852 
// Defines obj, preserves var_size_in_bytes
// Inline allocation from eden: a CAS retry loop on the heap top pointer.
// obj must be rax because cmpxchg compares implicitly against rax; t1 is
// clobbered with the proposed new top. Branches to slow_case when inline
// contiguous allocation is unsupported, the size wraps, or eden is full.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert(obj == rax, "obj must be in rax, for cmpxchg");
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    jmp(slow_case);
  } else {
    Register end = t1;
    Label retry;
    bind(retry);
    ExternalAddress heap_top((address) Universe::heap()->top_addr());
    movptr(obj, heap_top);
    // end = obj + size (constant size when var_size_in_bytes == noreg)
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
    }
    // if end < obj then we wrapped around => object too long => slow case
    cmpptr(end, obj);
    jcc(Assembler::below, slow_case);
    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
    jcc(Assembler::above, slow_case);
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
    // it otherwise. Use lock prefix for atomicity on MPs.
    locked_cmpxchgptr(end, heap_top);
    jcc(Assembler::notEqual, retry);
  }
}
6886 
// Standard frame prologue: save the caller's rbp and establish rbp as the
// frame pointer for the new frame.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}

// A 5 byte nop that is safe for patching (see patch_verified_entry)
// When address nops are disabled, four segment-override prefixes in front
// of a one-byte nop give the same 5-byte footprint.
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    emit_byte(0x26); // es:
    emit_byte(0x2e); // cs:
    emit_byte(0x64); // fs:
    emit_byte(0x65); // gs:
    emit_byte(0x90);
  }
}
6904 
// Compare st(0) against st(1) and pop both operands (common case of the
// general form below).
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}

// x87 compare of st(0) against st(index), optionally popping the left
// and/or right operand. With cmov support the fucomi/fucomip forms write
// eflags directly and no temp is needed (tmp must be noreg). Without it,
// the FPU status word is routed into eflags via rax (fnstsw/sahf), so tmp
// is required to preserve the caller's rax.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
6944 
// Compare st(0) with st(1), popping both, and materialize -1/0/+1 in dst
// (common case of the general form below).
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}

// x87 analogue of cmpsd2int: run fcmp (which leaves CF/PF/ZF per the
// comparison; PF = unordered) then map the flags to -1/0/+1 in dst, with
// NaN collapsing per unordered_is_less. On non-cmov CPUs dst doubles as
// fcmp's rax-preserving temp.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}
6969 
// x87 loads / control-word load from a literal address. NOTE: unlike most
// AddressLiteral helpers in this file, these call as_Address() directly
// with no rscratch1 fallback, i.e. they assume the literal is directly
// addressable.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}

void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}

void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}

void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}
6985 
// Core of the fast pow/exp paths: replaces X on the FPU top-of-stack with
// 2^X, by splitting X into int(X) + frac and assembling 2^int(X) directly
// in an IEEE double's exponent field. Out-of-range exponents produce NaN
// so callers can detect failure and fall back to the runtime.
void MacroAssembler::pow_exp_core_encoding() {
  // kills rax, rcx, rdx
  subptr(rsp,sizeof(jdouble));
  // computes 2^X. Stack: X ...
  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
  // keep it on the thread's stack to compute 2^int(X) later
  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
  fld_s(0);                 // Stack: X X ...
  frndint();                // Stack: int(X) X ...
  fsuba(1);                 // Stack: int(X) X-int(X) ...
  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
  faddp(1);                 // Stack: 2^(X-int(X))
  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
  // shift int(X)+1023 to exponent position.
  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
  // values so detect them and set result to NaN.
  movl(rax,Address(rsp,0));
  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
  addl(rax, 1023);
  movl(rdx,rax);
  shll(rax,20);
  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
  addl(rdx,1);
  // Check that 1 < int(X)+1023+1 < 2048
  // in 3 steps:
  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
  // 2- (int(X)+1023+1)&-2048 != 0
  // 3- (int(X)+1023+1)&-2048 != 1
  // Do 2- first because addl just updated the flags.
  cmov32(Assembler::equal,rax,rcx);
  cmpl(rdx,1);
  cmov32(Assembler::equal,rax,rcx);
  testl(rdx,rcx);
  cmov32(Assembler::notEqual,rax,rcx);
  // assemble the double 2^int(X): high word = exponent bits (or NaN),
  // low word = 0
  movl(Address(rsp,4),rax);
  movl(Address(rsp,0),0);
  fmul_d(Address(rsp,0));   // Stack: 2^X ...
  addptr(rsp,sizeof(jdouble));
}
7029 
// Raise the x87 precision-control field (bits 0x300 of the control word)
// for the intermediate pow/exp computation. The original control word is
// left at the top of the stack for restore_precision() below.
void MacroAssembler::increase_precision() {
  subptr(rsp, BytesPerWord);
  fnstcw(Address(rsp, 0));
  movl(rax, Address(rsp, 0));
  orl(rax, 0x300);
  push(rax);
  fldcw(Address(rsp, 0));
  pop(rax);
}

// Undo increase_precision(): reload the saved control word and pop it off
// the stack. Must be paired with a preceding increase_precision() call.
void MacroAssembler::restore_precision() {
  fldcw(Address(rsp, 0));
  addptr(rsp, BytesPerWord);
}
7044 
// Fast X^Y for X on TOS with Y below it (fyl2x consumes both, leaving
// Y*log2(X)), then exponentiates via pow_exp_core_encoding.
void MacroAssembler::fast_pow() {
  // computes X^Y = 2^(Y * log2(X))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fyl2x();                 // Stack: (Y*log2(X)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
}

// Fast exp(X) for X on TOS: scale by log2(e), then exponentiate via
// pow_exp_core_encoding.
void MacroAssembler::fast_exp() {
  // computes exp(X) = 2^(X * log2(e))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fldl2e();                // Stack: log2(e) X ...
  fmulp(1);                // Stack: (X*log2(e)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
}
7067 
// Computes exp(X) (is_exp, X on the FPU TOS) or X^Y (!is_exp, stack: X Y)
// using the fast x87 sequences above, falling back to the C runtime
// (SharedRuntime::dexp/dpow) whenever the fast path produces NaN or when
// X < 0 with a non-integer Y. Result is left on the FPU TOS.
// Fix applied below: the comment on the negate-result branch said
// "Y even" where that path is only reached when int(Y) is odd.
void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
  // kills rax, rcx, rdx
  // pow and exp needs 2 extra registers on the fpu stack.
  Label slow_case, done;
  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rdx,
    tmp = rdx;
  }
  Register tmp2 = rax;
  Register tmp3 = rcx;

  if (is_exp) {
    // Stack: X
    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
    fast_exp();                 // Stack: exp(X) X
    fcmp(tmp, 0, false, false); // Stack: exp(X) X
    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate argument. Stack: exp(X)
    if (num_fpu_regs_in_use > 0) {
      fxch();
      fpop();
    } else {
      ffree(1);
    }
    jmp(done);
  } else {
    // Stack: X Y
    Label x_negative, y_odd;

    fldz();                     // Stack: 0 X Y
    fcmp(tmp, 1, true, false);  // Stack: X Y
    jcc(Assembler::above, x_negative);

    // X >= 0

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fast_pow();                 // Stack: X^Y X Y
    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
    // X^Y not equal to itself: X^Y is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }
    jmp(done);

    // X <= 0
    bind(x_negative);

    fld_s(1);                   // Stack: Y X Y
    frndint();                  // Stack: int(Y) X Y
    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
    // Y is not an integer: negative base with fractional exponent
    // cannot be handled here.
    jcc(Assembler::notEqual, slow_case);

    subptr(rsp, 8);

    // For X^Y, when X < 0, Y has to be an integer and the final
    // result depends on whether it's odd or even. We just checked
    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
    // integer to test its parity. If int(Y) is huge and doesn't fit
    // in the 64 bit integer range, the integer indefinite value will
    // end up in the gp registers. Huge numbers are all even, the
    // integer indefinite number is even so it's fine.

#ifdef ASSERT
    // Let's check we don't end up with an integer indefinite number
    // when not expected. First test for huge numbers: check whether
    // int(Y)+1 == int(Y) which is true for very large numbers and
    // those are all even. A 64 bit integer is guaranteed to not
    // overflow for numbers where y+1 != y (when precision is set to
    // double precision).
    Label y_not_huge;

    fld1();                     // Stack: 1 int(Y) X Y
    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y

#ifdef _LP64
    // trip to memory to force the precision down from double extended
    // precision
    fstp_d(Address(rsp, 0));
    fld_d(Address(rsp, 0));
#endif

    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
#endif

    // move int(Y) as 64 bit integer to thread's stack
    fistp_d(Address(rsp,0));    // Stack: X Y

#ifdef ASSERT
    jcc(Assembler::notEqual, y_not_huge);

    // Y is huge so we know it's even. It may not fit in a 64 bit
    // integer and we don't want the debug code below to see the
    // integer indefinite value so overwrite int(Y) on the thread's
    // stack with 0.
    movl(Address(rsp, 0), 0);
    movl(Address(rsp, 4), 0);

    bind(y_not_huge);
#endif

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fabs();                     // Stack: abs(X) Y X Y
    fast_pow();                 // Stack: abs(X)^Y X Y
    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.

    pop(tmp2);
    NOT_LP64(pop(tmp3));
    jcc(Assembler::parity, slow_case);

#ifdef ASSERT
    // Check that int(Y) is not integer indefinite value (int
    // overflow). Shouldn't happen because for values that would
    // overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
    {
      Label integer;
      testl(tmp2, tmp2);
      jcc(Assembler::notZero, integer);
      cmpl(tmp3, 0x80000000);
      jcc(Assembler::notZero, integer);
      stop("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#else
    {
      Label integer;
      mov(tmp3, tmp2); // preserve tmp2 for parity check below
      shlq(tmp3, 1);
      jcc(Assembler::carryClear, integer);
      jcc(Assembler::notZero, integer);
      stop("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#endif
#endif

    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }

    testl(tmp2, 1);
    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
    // X <= 0, Y odd: X^Y = -abs(X)^Y
    fchs();                     // Stack: -abs(X)^Y Y
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  fpop();                       // pop incorrect result or int(Y)

  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
                      is_exp ? 1 : 2, num_fpu_regs_in_use);

  // Come here with result in F-TOS
  bind(done);
}
7243 
// Pop the x87 top-of-stack: free the register, then advance the FPU stack
// pointer past it.
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}

// x87 remainder of st(0) by st(1): loop fprem until the reduction is
// complete — indicated by C2 clear in the FPU status word (bit 0x400 on
// LP64's direct test; via sahf/parity on 32-bit). tmp preserves rax,
// which fnstsw_ax clobbers.
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
#ifdef _LP64
    testl(rax, 0x400);
    jcc(Assembler::notEqual, L);
#else
    sahf();
    jcc(Assembler::parity, L);
#endif // _LP64
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
7270 
7271 
// Increment the 32-bit word at a literal address, with the usual rscratch1
// fallback for unreachable literals. NOT atomic (see atomic_incl).
void MacroAssembler::incrementl(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementl(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementl(Address(rscratch1, 0));
  }
}

// ArrayAddress variant; assumes the address is directly reachable.
void MacroAssembler::incrementl(ArrayAddress dst) {
  incrementl(as_Address(dst));
}
7284 
7285 void MacroAssembler::incrementl(Register reg, int value) {
7286   if (value == min_jint) {addl(reg, value) ; return; }
7287   if (value <  0) { decrementl(reg, -value); return; }
7288   if (value == 0) {                        ; return; }
7289   if (value == 1 && UseIncDec) { incl(reg) ; return; }
7290   /* else */      { addl(reg, value)       ; return; }
7291 }
7292 
7293 void MacroAssembler::incrementl(Address dst, int value) {
7294   if (value == min_jint) {addl(dst, value) ; return; }
7295   if (value <  0) { decrementl(dst, -value); return; }
7296   if (value == 0) {                        ; return; }
7297   if (value == 1 && UseIncDec) { incl(dst) ; return; }
7298   /* else */      { addl(dst, value)       ; return; }
7299 }
7300 
// Unconditional jump to a literal destination, going indirect through
// rscratch1 when the target is out of direct reach.
void MacroAssembler::jump(AddressLiteral dst) {
  if (reachable(dst)) {
    jmp_literal(dst.target(), dst.rspec());
  } else {
    lea(rscratch1, dst);
    jmp(rscratch1);
  }
}

// Conditional jump to a literal destination. Reachable targets get a
// 2-byte short jcc when the displacement fits in 8 bits and needs no
// relocation, else the 6-byte long form. Unreachable targets have no
// conditional indirect jump on x86, so the condition is reversed around
// an indirect jmp through rscratch1.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    // displacement is relative to the end of the emitted instruction
    int offs = (intptr_t)dst.target() - ((intptr_t)_code_pos);
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
7338 
// Load the MXCSR (SSE control/status) register from a literal address,
// with rscratch1 fallback for unreachable literals.
void MacroAssembler::ldmxcsr(AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ldmxcsr(as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ldmxcsr(Address(rscratch1, 0));
  }
}
7347 
// Sign-extending byte load. On LP64 and P6+ a single movsx suffices;
// older 32-bit CPUs use a zero-extending load followed by shift/sar to
// sign-extend. Returns the code offset of the loading instruction (used
// for implicit-null-check bookkeeping by callers).
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}

// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}

// Zero-extending byte load; movzx on LP64/P6 (or when src aliases dst),
// else xor-clear followed by a partial-register byte load. Returns the
// offset of the loading instruction.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}

// Note: load_unsigned_short used to be called load_unsigned_word.
// Zero-extending 16-bit load; same strategy as load_unsigned_byte.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
7411 
// Load a value of 1/2/4/8 bytes from src, sign- or zero-extending the
// sub-word sizes per is_signed. On 32-bit, an 8-byte value is loaded into
// the register pair (dst, dst2).
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}

// Store a value of 1/2/4/8 bytes to dst. On 32-bit, an 8-byte value is
// stored from the register pair (src, src2).
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
7447 
// Store a 32-bit register to a literal address, with rscratch1 fallback
// for unreachable literals.
void MacroAssembler::mov32(AddressLiteral dst, Register src) {
  if (reachable(dst)) {
    movl(as_Address(dst), src);
  } else {
    lea(rscratch1, dst);
    movl(Address(rscratch1, 0), src);
  }
}

// Load a 32-bit value from a literal address into a register; same
// reachability handling as above.
void MacroAssembler::mov32(Register dst, AddressLiteral src) {
  if (reachable(src)) {
    movl(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movl(dst, Address(rscratch1, 0));
  }
}
7465 
7466 // C++ bool manipulation
7467 
// Load a C++ bool from memory. sizeof(bool) is compiler-dependent, so the
// access width is selected to match the host compiler's representation.
void MacroAssembler::movbool(Register dst, Address src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}

// Store a compile-time bool constant to memory, width-matched as above.
void MacroAssembler::movbool(Address dst, bool boolconst) {
  if(sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if(sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if(sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}

// Store a bool held in a register to memory, width-matched as above.
void MacroAssembler::movbool(Address dst, Register src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}

// Store an immediate byte to an array address (assumed directly
// reachable — no rscratch1 fallback).
void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}
7507 
// 32-bit load into an XMM register from a literal address, with rscratch1
// fallback for unreachable literals.
void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    movdl(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movdl(dst, Address(rscratch1, 0));
  }
}

// 64-bit load into an XMM register from a literal address; same
// reachability handling.
void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    movq(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movq(dst, Address(rscratch1, 0));
  }
}
7525 
// Load a double from a literal address. UseXmmLoadAndClearUpper selects
// movsd (which zeroes the upper half of dst) over movlpd (which leaves it
// untouched); the flag exists because which form is faster varies by CPU.
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, as_Address(src));
    } else {
      movlpd(dst, as_Address(src));
    }
  } else {
    lea(rscratch1, src);
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, Address(rscratch1, 0));
    } else {
      movlpd(dst, Address(rscratch1, 0));
    }
  }
}

// Load a float from a literal address, with rscratch1 fallback.
void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    movss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movss(dst, Address(rscratch1, 0));
  }
}
7551 
7552 void MacroAssembler::movptr(Register dst, Register src) {
7553   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
7554 }
7555 
7556 void MacroAssembler::movptr(Register dst, Address src) {
7557   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
7558 }
7559 
7560 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
7561 void MacroAssembler::movptr(Register dst, intptr_t src) {
7562   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
7563 }
7564 
7565 void MacroAssembler::movptr(Address dst, Register src) {
7566   LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
7567 }
7568 
7569 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
7570   if (reachable(src)) {
7571     Assembler::movsd(dst, as_Address(src));
7572   } else {
7573     lea(rscratch1, src);
7574     Assembler::movsd(dst, Address(rscratch1, 0));
7575   }
7576 }
7577 
7578 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
7579   if (reachable(src)) {
7580     Assembler::movss(dst, as_Address(src));
7581   } else {
7582     lea(rscratch1, src);
7583     Assembler::movss(dst, Address(rscratch1, 0));
7584   }
7585 }
7586 
7587 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
7588   if (reachable(src)) {
7589     Assembler::mulsd(dst, as_Address(src));
7590   } else {
7591     lea(rscratch1, src);
7592     Assembler::mulsd(dst, Address(rscratch1, 0));
7593   }
7594 }
7595 
7596 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
7597   if (reachable(src)) {
7598     Assembler::mulss(dst, as_Address(src));
7599   } else {
7600     lea(rscratch1, src);
7601     Assembler::mulss(dst, Address(rscratch1, 0));
7602   }
7603 }
7604 
7605 void MacroAssembler::null_check(Register reg, int offset) {
7606   if (needs_explicit_null_check(offset)) {
7607     // provoke OS NULL exception if reg = NULL by
7608     // accessing M[reg] w/o changing any (non-CC) registers
7609     // NOTE: cmpl is plenty here to provoke a segv
7610     cmpptr(rax, Address(reg, 0));
7611     // Note: should probably use testl(rax, Address(reg, 0));
7612     //       may be shorter code (however, this version of
7613     //       testl needs to be implemented first)
7614   } else {
7615     // nothing to do, (later) access of M[reg + offset]
7616     // will provoke OS NULL exception if reg = NULL
7617   }
7618 }
7619 
7620 void MacroAssembler::os_breakpoint() {
7621   // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
7622   // (e.g., MSVC can't call ps() otherwise)
7623   call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
7624 }
7625 
// Restore the full CPU state saved by push_CPU_state(): the FPU/SSE state
// first (it was pushed last), then the integer registers and flags.
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}
7630 
7631 void MacroAssembler::pop_FPU_state() {
7632   NOT_LP64(frstor(Address(rsp, 0));)
7633   LP64_ONLY(fxrstor(Address(rsp, 0));)
7634   addptr(rsp, FPUStateSizeInWords * wordSize);
7635 }
7636 
7637 void MacroAssembler::pop_IU_state() {
7638   popa();
7639   LP64_ONLY(addq(rsp, 8));
7640   popf();
7641 }
7642 
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  // Integer state first, so pop_CPU_state() restores in reverse order.
  push_IU_state();
  push_FPU_state();
}
7649 
// Reserve stack space and save the FPU/SSE state: fnsave (followed by
// fwait) on 32-bit, fxsave on 64-bit. Restored by pop_FPU_state().
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // LP64
}
7659 
7660 void MacroAssembler::push_IU_state() {
7661   // Push flags first because pusha kills them
7662   pushf();
7663   // Make sure rsp stays 16-byte aligned
7664   LP64_ONLY(subq(rsp, 8));
7665   pusha();
7666 }
7667 
// Clear the thread-local last-Java-frame anchor. last_Java_sp is always
// cleared (see set_last_Java_frame, which stores sp last); fp and pc are
// cleared only on request. If java_thread is noreg, rdi is clobbered to
// load the current thread.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc)
    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);

}
7684 
7685 void MacroAssembler::restore_rax(Register tmp) {
7686   if (tmp == noreg) pop(rax);
7687   else if (tmp != rax) mov(rax, tmp);
7688 }
7689 
// Round reg up to the nearest multiple of modulus. The add/mask sequence
// assumes modulus is a power of two.
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
7694 
7695 void MacroAssembler::save_rax(Register tmp) {
7696   if (tmp == noreg) push(rax);
7697   else if (tmp != rax) mov(tmp, rax);
7698 }
7699 
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  // Derive a per-thread, int-aligned offset within the page from the
  // thread pointer. tmp is clobbered.
  movl(tmp, thread);
  shrl(tmp, os::get_serialize_page_shift_count());
  andl(tmp, (os::vm_page_size() - sizeof(int)));

  Address index(noreg, tmp, Address::times_1);
  ExternalAddress page(os::get_memory_serialize_page());

  // Size of store must match masking code above
  movl(as_Address(ArrayAddress(page, index)), tmp);
}
7715 
// Calls to C land
//
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  // Store sp last: reset_last_Java_frame clears sp to invalidate the
  // anchor, so fp/pc must be in place before sp becomes non-null.
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
7751 
7752 void MacroAssembler::shlptr(Register dst, int imm8) {
7753   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
7754 }
7755 
7756 void MacroAssembler::shrptr(Register dst, int imm8) {
7757   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
7758 }
7759 
7760 void MacroAssembler::sign_extend_byte(Register reg) {
7761   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
7762     movsbl(reg, reg); // movsxb
7763   } else {
7764     shll(reg, 24);
7765     sarl(reg, 24);
7766   }
7767 }
7768 
7769 void MacroAssembler::sign_extend_short(Register reg) {
7770   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
7771     movswl(reg, reg); // movsxw
7772   } else {
7773     shll(reg, 16);
7774     sarl(reg, 16);
7775   }
7776 }
7777 
// testl against a literal-addressed operand. Unlike the other
// AddressLiteral helpers in this file there is no rscratch1 fallback:
// the literal must be reachable with a direct displacement.
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
7782 
7783 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
7784   if (reachable(src)) {
7785     Assembler::sqrtsd(dst, as_Address(src));
7786   } else {
7787     lea(rscratch1, src);
7788     Assembler::sqrtsd(dst, Address(rscratch1, 0));
7789   }
7790 }
7791 
7792 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
7793   if (reachable(src)) {
7794     Assembler::sqrtss(dst, as_Address(src));
7795   } else {
7796     lea(rscratch1, src);
7797     Assembler::sqrtss(dst, Address(rscratch1, 0));
7798   }
7799 }
7800 
7801 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
7802   if (reachable(src)) {
7803     Assembler::subsd(dst, as_Address(src));
7804   } else {
7805     lea(rscratch1, src);
7806     Assembler::subsd(dst, Address(rscratch1, 0));
7807   }
7808 }
7809 
7810 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
7811   if (reachable(src)) {
7812     Assembler::subss(dst, as_Address(src));
7813   } else {
7814     lea(rscratch1, src);
7815     Assembler::subss(dst, Address(rscratch1, 0));
7816   }
7817 }
7818 
7819 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
7820   if (reachable(src)) {
7821     Assembler::ucomisd(dst, as_Address(src));
7822   } else {
7823     lea(rscratch1, src);
7824     Assembler::ucomisd(dst, Address(rscratch1, 0));
7825   }
7826 }
7827 
7828 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
7829   if (reachable(src)) {
7830     Assembler::ucomiss(dst, as_Address(src));
7831   } else {
7832     lea(rscratch1, src);
7833     Assembler::ucomiss(dst, Address(rscratch1, 0));
7834   }
7835 }
7836 
7837 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
7838   // Used in sign-bit flipping with aligned address.
7839   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
7840   if (reachable(src)) {
7841     Assembler::xorpd(dst, as_Address(src));
7842   } else {
7843     lea(rscratch1, src);
7844     Assembler::xorpd(dst, Address(rscratch1, 0));
7845   }
7846 }
7847 
7848 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
7849   // Used in sign-bit flipping with aligned address.
7850   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
7851   if (reachable(src)) {
7852     Assembler::xorps(dst, as_Address(src));
7853   } else {
7854     lea(rscratch1, src);
7855     Assembler::xorps(dst, Address(rscratch1, 0));
7856   }
7857 }
7858 
7859 // AVX 3-operands instructions
7860 
7861 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7862   if (reachable(src)) {
7863     vaddsd(dst, nds, as_Address(src));
7864   } else {
7865     lea(rscratch1, src);
7866     vaddsd(dst, nds, Address(rscratch1, 0));
7867   }
7868 }
7869 
7870 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7871   if (reachable(src)) {
7872     vaddss(dst, nds, as_Address(src));
7873   } else {
7874     lea(rscratch1, src);
7875     vaddss(dst, nds, Address(rscratch1, 0));
7876   }
7877 }
7878 
7879 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7880   if (reachable(src)) {
7881     vandpd(dst, nds, as_Address(src));
7882   } else {
7883     lea(rscratch1, src);
7884     vandpd(dst, nds, Address(rscratch1, 0));
7885   }
7886 }
7887 
7888 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7889   if (reachable(src)) {
7890     vandps(dst, nds, as_Address(src));
7891   } else {
7892     lea(rscratch1, src);
7893     vandps(dst, nds, Address(rscratch1, 0));
7894   }
7895 }
7896 
7897 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7898   if (reachable(src)) {
7899     vdivsd(dst, nds, as_Address(src));
7900   } else {
7901     lea(rscratch1, src);
7902     vdivsd(dst, nds, Address(rscratch1, 0));
7903   }
7904 }
7905 
7906 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7907   if (reachable(src)) {
7908     vdivss(dst, nds, as_Address(src));
7909   } else {
7910     lea(rscratch1, src);
7911     vdivss(dst, nds, Address(rscratch1, 0));
7912   }
7913 }
7914 
7915 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7916   if (reachable(src)) {
7917     vmulsd(dst, nds, as_Address(src));
7918   } else {
7919     lea(rscratch1, src);
7920     vmulsd(dst, nds, Address(rscratch1, 0));
7921   }
7922 }
7923 
7924 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7925   if (reachable(src)) {
7926     vmulss(dst, nds, as_Address(src));
7927   } else {
7928     lea(rscratch1, src);
7929     vmulss(dst, nds, Address(rscratch1, 0));
7930   }
7931 }
7932 
7933 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7934   if (reachable(src)) {
7935     vsubsd(dst, nds, as_Address(src));
7936   } else {
7937     lea(rscratch1, src);
7938     vsubsd(dst, nds, Address(rscratch1, 0));
7939   }
7940 }
7941 
7942 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7943   if (reachable(src)) {
7944     vsubss(dst, nds, as_Address(src));
7945   } else {
7946     lea(rscratch1, src);
7947     vsubss(dst, nds, Address(rscratch1, 0));
7948   }
7949 }
7950 
7951 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7952   if (reachable(src)) {
7953     vxorpd(dst, nds, as_Address(src));
7954   } else {
7955     lea(rscratch1, src);
7956     vxorpd(dst, nds, Address(rscratch1, 0));
7957   }
7958 }
7959 
7960 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7961   if (reachable(src)) {
7962     vxorps(dst, nds, as_Address(src));
7963   } else {
7964     lea(rscratch1, src);
7965     vxorps(dst, nds, Address(rscratch1, 0));
7966   }
7967 }
7968 
7969 
7970 //////////////////////////////////////////////////////////////////////////////////
7971 #ifndef SERIALGC
7972 
// G1 SATB pre-barrier. When concurrent marking is active, records the
// value about to be overwritten (pre_val) in the thread's SATB mark
// queue; when the queue is full, calls into the runtime. If obj != noreg
// the previous value is first loaded from Address(obj, 0) into pre_val.
// tmp is clobbered. tosca_live tells the slow path whether rax holds a
// live value that must be saved/restored around the runtime call.
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg) {
    assert_different_registers(obj, pre_val, tmp);
    assert(pre_val != rax, "check this code");
  }

  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active?
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?
  cmpptr(pre_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  movptr(tmp, index);                   // tmp := *index_adr
  cmpptr(tmp, 0);                       // tmp == 0?
  jcc(Assembler::equal, runtime);       // If yes, goto runtime

  subptr(tmp, wordSize);                // tmp := tmp - wordSize
  movptr(index, tmp);                   // *index_adr := tmp
  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr

  // Record the previous value
  movptr(Address(tmp, 0), pre_val);
  jmp(done);

  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);

  if (obj != noreg && obj != rax)
    push(obj);

  if (pre_val != rax)
    push(pre_val);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we care generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have have a full interpreter frame on the stack
  // expand_call should be passed true.

  NOT_LP64( push(thread); )

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  NOT_LP64( pop(thread); )

  // restore the live input values (pushed in reverse order above)
  if (pre_val != rax)
    pop(pre_val);

  if (obj != noreg && obj != rax)
    pop(obj);

  if(tosca_live) pop(rax);

  bind(done);
}
8086 
// G1 post-barrier. After new_val has been stored at store_addr: if the
// store crosses heap regions, new_val is non-NULL, and the covering card
// is not already dirty, dirty the card and enqueue its address on the
// thread's dirty-card queue, calling into the runtime when the queue is
// full. tmp and tmp2 are clobbered (and rscratch1 on 64-bit).
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  Label done;
  Label runtime;

  // Does store cross heap regions?

  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?

  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?

  ExternalAddress cardtable((address) ct->byte_map_base);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
#ifdef _LP64
  const Register card_addr = tmp;

  movq(card_addr, store_addr);
  shrq(card_addr, CardTableModRefBS::card_shift);

  lea(tmp2, cardtable);

  // get the address of the card
  addq(card_addr, tmp2);
#else
  const Register card_index = tmp;

  movl(card_index, store_addr);
  shrl(card_index, CardTableModRefBS::card_shift);

  Address index(noreg, card_index, Address::times_1);
  const Register card_addr = tmp;
  lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
#endif
  cmpb(Address(card_addr, 0), 0);
  jcc(Assembler::equal, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  // 0 is the dirty-card value (matches the cmpb above)
  movb(Address(card_addr, 0), 0);

  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  // on 32-bit card_index aliases card_addr (both are tmp)
  movl(Address(tmp2, 0), card_index);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);
}
8180 
8181 #endif // SERIALGC
8182 //////////////////////////////////////////////////////////////////////////////////
8183 
8184 
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  store_check_part_1(obj);
  store_check_part_2(obj);
}
8191 
void MacroAssembler::store_check(Register obj, Address dst) {
  // The card table is indexed by the object address, so the destination
  // field address dst is not needed here.
  store_check(obj);
}
8195 
8196 
// split the store check operation so that other instructions can be scheduled inbetween
// Part 1: turn the object address in obj into a card-table index.
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  shrptr(obj, CardTableModRefBS::card_shift);
}
8203 
// Part 2: dirty the card whose index was computed by store_check_part_1
// (obj now holds the card index, not the object address).
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and
  // it will never need to be relocated. On 64bit however the value may be too
  // large for a 32bit displacement
  intptr_t disp = (intptr_t) ct->byte_map_base;
  if (is_simm32(disp)) {
    Address cardtable(noreg, obj, Address::times_1, disp);
    movb(cardtable, 0);
  } else {
    // By doing it as an ExternalAddress disp could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and
    // a smarter version of as_Address. Worst case it is two instructions which
    // is no worse off then loading disp into a register and doing as a simple
    // Address() as above.
    // We can't do as ExternalAddress as the only style since if disp == 0 we'll
    // assert since NULL isn't acceptable in a reloci (see 6644928). In any case
    // in some cases we'll get a single instruction version.

    ExternalAddress cardtable((address)disp);
    Address index(noreg, obj, Address::times_1);
    movb(as_Address(ArrayAddress(cardtable, index)), 0);
  }
}
8235 
8236 void MacroAssembler::subptr(Register dst, int32_t imm32) {
8237   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
8238 }
8239 
8240 // Force generation of a 4 byte immediate value even if it fits into 8bit
8241 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
8242   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
8243 }
8244 
8245 void MacroAssembler::subptr(Register dst, Register src) {
8246   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
8247 }
8248 
8249 // C++ bool manipulation
8250 void MacroAssembler::testbool(Register dst) {
8251   if(sizeof(bool) == 1)
8252     testb(dst, 0xff);
8253   else if(sizeof(bool) == 2) {
8254     // testw implementation needed for two byte bools
8255     ShouldNotReachHere();
8256   } else if(sizeof(bool) == 4)
8257     testl(dst, dst);
8258   else
8259     // unsupported
8260     ShouldNotReachHere();
8261 }
8262 
8263 void MacroAssembler::testptr(Register dst, Register src) {
8264   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
8265 }
8266 
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Bump-pointer allocation from the current thread's TLAB. The allocation
// size is either the register var_size_in_bytes (when valid) or the
// constant con_size_in_bytes. On success obj holds the new object's
// address; on overflow of the TLAB, control jumps to slow_case. On
// 32-bit, t1 is clobbered to hold the thread.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  assert_different_registers(obj, t1, t2);
  assert_different_registers(obj, var_size_in_bytes, t1);
  Register end = t2;
  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);

  verify_tlab();

  NOT_LP64(get_thread(thread));

  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
  if (var_size_in_bytes == noreg) {
    lea(end, Address(obj, con_size_in_bytes));
  } else {
    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
  }
  // does the new end fit within the TLAB?
  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
  jcc(Assembler::above, slow_case);

  // update the tlab top pointer
  movptr(Address(thread, JavaThread::tlab_top_offset()), end);

  // recover var_size_in_bytes if necessary
  if (var_size_in_bytes == end) {
    subptr(var_size_in_bytes, obj);
  }
  verify_tlab();
}
8301 
// Preserves rbx, and rdx.
// Refill the current thread's TLAB from eden, or decide to allocate the
// object directly in the shared eden. Either discards the current TLAB
// (filling its tail with a dummy int array so the heap stays parseable)
// and allocates a fresh one, or — when the remaining TLAB free space is
// above the refill-waste limit — keeps the TLAB and jumps to try_eden.
// Jumps to retry after a successful refill, or to slow_case when inline
// contiguous allocation is unavailable or eden allocation fails.
// Returns the register holding the thread for use by the caller.
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = rax;
  Register t1  = rcx;
  Register t2  = rsi;
  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    jmp(slow_case);
  }

  NOT_LP64(get_thread(thread_reg));

  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space
  subptr(t1, top);
  shrptr(t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.
  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  jcc(Assembler::lessEqual, discard_tlab);

  // Retain
  // %%% yuck as movptr...
  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
  if (TLABStats) {
    // increment number of slow_allocations
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
  }
  jmp(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
    // accumulate wastage -- t1 is amount free in tlab
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  testptr(top, top);
  jcc(Assembler::zero, do_refill);

  // set up the mark word
  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  // set the length to the remaining space
  subptr(t1, typeArrayOopDesc::header_size(T_INT));
  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
  // set klass to intArrayKlass
  // dubious reloc why not an oop reloc?
  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // account the bytes consumed by the filler object
  movptr(t1, top);
  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
  incr_allocated_bytes(thread_reg, t1, 0);

  // refill the tlab with an eden allocation
  bind(do_refill);
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
  shlptr(t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = rsi;
    assert_different_registers(tsize, thread_reg, t1);
    push(tsize);
    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
    shlptr(tsize, LogHeapWordSize);
    cmpptr(t1, tsize);
    jcc(Assembler::equal, ok);
    stop("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    pop(tsize);
  }
#endif
  // install the new TLAB bounds in the thread
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
  addptr(top, t1);
  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
  verify_tlab();
  jmp(retry);

  return thread_reg; // for use by caller
}
8408 
// Add an allocation's size to the thread-local allocated_bytes counter.
// The size comes from the register var_size_in_bytes when it is valid,
// otherwise from the constant con_size_in_bytes. On 32-bit the counter
// is wider than a word, so the carry is propagated with adcl; t1 is used
// there as a temp to load the thread when thread is not supplied.
void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  if (!thread->is_valid()) {
#ifdef _LP64
    thread = r15_thread;
#else
    assert(t1->is_valid(), "need temp reg");
    thread = t1;
    get_thread(thread);
#endif
  }

#ifdef _LP64
  if (var_size_in_bytes->is_valid()) {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
#else
  if (var_size_in_bytes->is_valid()) {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
  // propagate the carry into the high word of the 64-bit counter
  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
8438 
// Call a floating-point runtime routine (e.g. SharedRuntime::dsin) with
// nb_args double arguments taken from the x87 stack, preserving all GP
// registers (pusha/popa), any live XMM registers, and any other live x87
// registers (num_fpu_regs_in_use of them, which includes the arguments).
// On return the routine's result has been reloaded onto the x87 stack.
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  pusha();

  // if we are coming from c1, xmm registers may be live
  if (UseSSE >= 1) {
    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
  }
  int off = 0;
  if (UseSSE == 1)  {
    // UseSSE == 1: only single-precision SSE state can be live; spill the
    // low 32 bits of each of xmm0-xmm7 (one jdouble-sized slot per reg).
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
  } else if (UseSSE >= 2)  {
    // UseSSE >= 2: spill full doubles; on 64-bit also xmm8-xmm15.
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
#ifdef _LP64
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
#endif
  }

  // Preserve registers across runtime call
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument(s) to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin, dcos etc.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    // Reload the arguments (deepest first) so they are back on the x87
    // stack in the order the call sequence below expects.
    for (int i = nb_args-1; i >= 0; i--) {
      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
    }
  }

  // Pop the x87 arguments into outgoing stack slots for the C call.
  subptr(rsp, nb_args*sizeof(jdouble));
  for (int i = 0; i < nb_args; i++) {
    fstp_d(Address(rsp, i*sizeof(jdouble)));
  }

#ifdef _LP64
  // 64-bit ABI passes doubles in xmm0/xmm1; load them from the slots.
  if (nb_args > 0) {
    movdbl(xmm0, Address(rsp, 0));
  }
  if (nb_args > 1) {
    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
  }
  assert(nb_args <= 2, "unsupported number of args");
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  NEEDS_CLEANUP;
  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level

  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);

#ifdef _LP64
  // 64-bit result comes back in xmm0; move it onto the x87 stack.
  movsd(Address(rsp, 0), xmm0);
  fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble) * nb_args);
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU
    // stack except incoming arguments
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
    // Finally reload the saved return value as the new top-of-stack.
    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble) * nb_args);
  }

  // Restore the XMM registers spilled above (mirror of the save sequence).
  off = 0;
  if (UseSSE == 1)  {
    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
  } else if (UseSSE >= 2)  {
    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
#ifdef _LP64
    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
#endif
  }
  if (UseSSE >= 1) {
    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
  }
  popa();
}
8576 
// pi/4: trigfunc() below compares |x| against this to decide whether the
// hardware fsin/fcos/ftan fast path may be used.
static const double     pi_4 =  0.7853981633974483;
8578 
// Compute sin ('s'), cos ('c') or tan ('t') of the double on the x87
// top-of-stack, leaving the result on the x87 top-of-stack.  Arguments in
// (-pi/4, pi/4) use the hardware instruction directly; anything else falls
// back to the SharedRuntime C implementation via fp_runtime_fallback().
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  // If pi_4 is not reachable, the fast path is skipped entirely and we
  // fall straight through to the runtime call below.
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    fcmp(tmp);
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  switch(trig) {
  case 's':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
    }
    break;
  case 'c':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
    }
    break;
  case 't':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }

  // Come here with result in F-TOS
  bind(done);

  if (tmp != noreg) {
    pop(tmp);
  }
}
8653 
8654 
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
// Clobbers: recv_klass, scan_temp (and method_result before success).
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = instanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, instanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is emitted twice (peel == 1 is the peeled first iteration) so
  // that the common hit-on-first-entry case falls straight through to
  // found_method without taking a branch.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}
8728 
8729 
8730 void MacroAssembler::check_klass_subtype(Register sub_klass,
8731                            Register super_klass,
8732                            Register temp_reg,
8733                            Label& L_success) {
8734   Label L_failure;
8735   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
8736   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
8737   bind(L_failure);
8738 }
8739 
8740 
// Fast path of the subtype check: an identity test followed by a probe of
// the supertype display (or the secondary-super cache slot, which aliases
// it).  Each of L_success / L_failure / L_slow_path may be NULL, meaning
// "fall through"; at most one may be NULL.  If super_check_offset is not
// supplied it is loaded from super_klass into temp_reg.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // NULL labels are replaced by L_fallthrough so the code below can jump
  // unconditionally; final_jmp omits the jump when it would be a no-op.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset known only at runtime: compare it against sc_offset to decide
    // whether a miss means "fail" or "take the slow path".
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
8842 
8843 
// Slow path of the subtype check: linear scan of sub_klass's
// secondary-supers array via repne_scan, updating the secondary-super
// cache on a hit.  L_success / L_failure may be NULL ("fall through");
// at most one may be NULL.  If set_cond_codes, Z/NZ are valid on exit.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  // (With compressed oops, rax is always needed because it gets encoded.)
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Count how often this slow path is taken.
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
#ifdef _LP64
  // This part is tricky, as values in supers array could be 32 or 64 bit wide
  // and we store values in objArrays always encoded, thus we need to encode
  // the value of rax before repne.  Note that rax is dead after the repne.
  if (UseCompressedOops) {
    encode_heap_oop_not_null(rax); // Changes flags.
    // The superclass is never null; it would be a basic system error if a null
    // pointer were to sneak in here.  Note that we have already loaded the
    // Klass::super_check_offset from the super_klass in the fast path,
    // so if there is a null in that register, we are already in the afterlife.
    testl(rax,rax); // Set Z = 0
    repne_scanl();
  } else
#endif // _LP64
  {
    testptr(rax,rax); // Set Z = 0
    repne_scan();
  }
  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  // NZ here means the scan exhausted the array without a match: failure.
  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
8949 
8950 
8951 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
8952   if (VM_Version::supports_cmov()) {
8953     cmovl(cc, dst, src);
8954   } else {
8955     Label L;
8956     jccb(negate_condition(cc), L);
8957     movl(dst, src);
8958     bind(L);
8959   }
8960 }
8961 
8962 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
8963   if (VM_Version::supports_cmov()) {
8964     cmovl(cc, dst, src);
8965   } else {
8966     Label L;
8967     jccb(negate_condition(cc), L);
8968     movl(dst, src);
8969     bind(L);
8970   }
8971 }
8972 
// Emit a call to the verify_oop stub for the oop in 'reg', tagged with
// message 's'.  No-op unless -XX:+VerifyOops.  Careful not to clobber
// anything: only rax (and r10 on 64-bit) are touched, and the called
// subroutine restores them.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  // NOTE: this buffer is intentionally never freed -- the generated code
  // holds a pointer to it and may run at any later time.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop: %s: %s", reg->name(), s);
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
}
8994 
8995 
// Return (value + offset) as a constant if *delayed_value_addr has already
// been filled in (non-zero); otherwise emit code that loads the value into
// 'tmp' at run time (adding 'offset') and return tmp.  Presumably the slot
// is populated later during bootstrap -- verify against the callers that
// set up delayed_value_addr.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // In debug builds, trap at run time if the delayed value is still zero.
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      jcc(Assembler::notZero, L);
      char* buf = new char[40];
      sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
      stop(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}
9027 
9028 
9029 // registers on entry:
9030 //  - rax ('check' register): required MethodType
9031 //  - rcx: method handle
9032 //  - rdx, rsi, or ?: killable temp
9033 void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
9034                                               Register temp_reg,
9035                                               Label& wrong_method_type) {
9036   Address type_addr(mh_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg));
9037   // compare method type against that of the receiver
9038   if (UseCompressedOops) {
9039     load_heap_oop(temp_reg, type_addr);
9040     cmpptr(mtype_reg, temp_reg);
9041   } else {
9042     cmpptr(mtype_reg, type_addr);
9043   }
9044   jcc(Assembler::notEqual, wrong_method_type);
9045 }
9046 
9047 
// A method handle has a "vmslots" field which gives the size of its
// argument list in JVM stack slots.  This field is either located directly
// in every method handle, or else is indirectly accessed through the
// method handle's MethodType.  This macro hides the distinction.
void MacroAssembler::load_method_handle_vmslots(Register vmslots_reg, Register mh_reg,
                                                Register temp_reg) {
  assert_different_registers(vmslots_reg, mh_reg, temp_reg);
  // load mh.type.form.vmslots
  // NOTE: delayed_value() may itself emit a load (see delayed_value_impl),
  // so these three statements must stay interleaved exactly as written.
  Register temp2_reg = vmslots_reg;
  load_heap_oop(temp2_reg, Address(mh_reg,    delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg)));
  load_heap_oop(temp2_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, temp_reg)));
  movl(vmslots_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodTypeForm::vmslots_offset_in_bytes, temp_reg)));
}
9061 
9062 
// registers on entry:
//  - rcx: method handle
//  - rdx: killable temp (interpreted only)
//  - rax: killable temp (compiled only)
// Tail-jumps to the method handle's interpreted entry point; does not return.
void MacroAssembler::jump_to_method_handle_entry(Register mh_reg, Register temp_reg) {
  assert(mh_reg == rcx, "caller must put MH object in rcx");
  assert_different_registers(mh_reg, temp_reg);

  // pick out the interpreted side of the handler
  // NOTE: vmentry is not an oop!
  movptr(temp_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::vmentry_offset_in_bytes, temp_reg)));

  // off we go...
  jmp(Address(temp_reg, MethodHandleEntry::from_interpreted_entry_offset_in_bytes()));

  // for the various stubs which take control at this point,
  // see MethodHandles::generate_method_handle_stub
}
9081 
9082 
9083 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
9084                                          int extra_slot_offset) {
9085   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
9086   int stackElementSize = Interpreter::stackElementSize;
9087   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
9088 #ifdef ASSERT
9089   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
9090   assert(offset1 - offset == stackElementSize, "correct arithmetic");
9091 #endif
9092   Register             scale_reg    = noreg;
9093   Address::ScaleFactor scale_factor = Address::no_scale;
9094   if (arg_slot.is_constant()) {
9095     offset += arg_slot.as_constant() * stackElementSize;
9096   } else {
9097     scale_reg    = arg_slot.as_register();
9098     scale_factor = Address::times(stackElementSize);
9099   }
9100   offset += wordSize;           // return PC is on stack
9101   return Address(rsp, scale_reg, scale_factor, offset);
9102 }
9103 
9104 
// Emit a call to the verify_oop stub for the oop stored at 'addr', tagged
// with message 's'.  No-op unless -XX:+VerifyOops.  If addr uses rsp, the
// displacement is adjusted to compensate for the pushes done here.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  // NOTE: the buffer is intentionally leaked; the generated code keeps a
  // pointer to it and may run at any later time.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop_addr: %s", s);

#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
9140 
// Debug-only sanity check of the current thread's TLAB invariants:
// start <= top <= end.  Emits nothing unless ASSERT && UseTLAB && VerifyOops.
// All registers used are saved and restored, so callers see no clobbers.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    // On 64-bit the thread is already in r15; on 32-bit load it into rbx.
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    // Check: tlab_top >= tlab_start.
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    stop("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    // Check: tlab_end >= tlab_top.
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    stop("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
9171 
// Decodes an x87 FPU control word: rounding/precision control in the high
// bits, exception masks in the low bits.  print() emits a one-line summary.
class ControlWord {
 public:
  int32_t _value;

  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
  int  precision_control() const       { return  (_value >>  8) & 3      ; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // Two-bit fields index directly into their name tables.
    static const char* const rc_names[4] = {
      "round near", "round down", "round up  ", "chop      "
    };
    static const char* const pc_names[4] = {
      "24 bits ", "reserved", "53 bits ", "64 bits "
    };
    // Exception-mask flags: upper case when set, lower case when clear.
    char flags[9];
    flags[0] = ' ';
    flags[1] = ' ';
    flags[2] = precision()    ? 'P' : 'p';
    flags[3] = underflow()    ? 'U' : 'u';
    flags[4] = overflow()     ? 'O' : 'o';
    flags[5] = zero_divide()  ? 'Z' : 'z';
    flags[6] = denormalized() ? 'D' : 'd';
    flags[7] = invalid()      ? 'I' : 'i';
    flags[8] = '\0';
    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, flags,
           rc_names[rounding_control()], pc_names[precision_control()]);
  }

};
9218 
// Debug-printing view of the x87 FPU status word (low 16 bits of
// _value): busy bit, condition codes C0-C3, top-of-stack pointer, and
// exception/status flags.
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return (_value & 0x8000) != 0; }
  bool C3() const                      { return (_value & 0x4000) != 0; }
  bool C2() const                      { return (_value & 0x0400) != 0; }
  bool C1() const                      { return (_value & 0x0200) != 0; }
  bool C0() const                      { return (_value & 0x0100) != 0; }
  int  top() const                     { return (_value >> 11) & 7; }
  bool error_status() const            { return (_value & 0x0080) != 0; }
  bool stack_fault() const             { return (_value & 0x0040) != 0; }
  bool precision() const               { return (_value & 0x0020) != 0; }
  bool underflow() const               { return (_value & 0x0010) != 0; }
  bool overflow() const                { return (_value & 0x0008) != 0; }
  bool zero_divide() const             { return (_value & 0x0004) != 0; }
  bool denormalized() const            { return (_value & 0x0002) != 0; }
  bool invalid() const                 { return (_value & 0x0001) != 0; }

  // Prints "<hex>  flags = <ESPUOZDI>, cc =  <3210>, top = <n>" where a
  // letter/digit appears for each set bit and '-' for each clear one.
  void print() const {
    // condition code bits C3..C0
    char cond[5];
    cond[0] = C3() ? '3' : '-';
    cond[1] = C2() ? '2' : '-';
    cond[2] = C1() ? '1' : '-';
    cond[3] = C0() ? '0' : '-';
    cond[4] = '\0';
    // exception/status flags: one letter per bit, from bit 7 down to bit 0
    const char letters[] = "ESPUOZDI";
    char flags[9];
    for (int idx = 0; idx < 8; idx++) {
      flags[idx] = (((_value >> (7 - idx)) & 1) != 0) ? letters[idx] : '-';
    }
    flags[8] = '\0';
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, flags, cond, top());
  }

};
9262 
// Debug-printing view of the x87 FPU tag word: two tag bits per
// physical register (00 valid, 01 zero, 10 special, 11 empty — see
// FPU_State::tag_as_string).
class TagWord {
 public:
  int32_t _value;

  // Tag bits for physical register i: bits [2i+1 : 2i] of the word.
  int tag_at(int i) const              { return (_value >> (i * 2)) & 0x3; }

  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};
9274 
// One 80-bit x87 register as captured in memory: 64-bit mantissa split
// into two 32-bit halves (_m0 low, _m1 high) followed by the 16-bit
// sign/exponent word.  Field order matches the in-memory save layout.
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // The "real indefinite" QNaN: sign/exponent all ones with mantissa
  // 0xC0000000_00000000.
  bool is_indefinite() const           {
    return (_ex == -1) && (_m1 == (int32_t)0xC0000000) && (_m0 == 0);
  }

  void print() const {
    const bool negative = _ex < 0;
    // An exponent field of all ones (0x7FFF or, with sign, 0xFFFF) is
    // labeled "NaN" in the dump.
    const bool nan_exp = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x  %s",
           negative ? '-' : '+', _ex, _m1, _m0, nan_exp ? "NaN" : "   ");
  }

};
9292 
// In-memory image of the x87 FPU environment plus all eight 80-bit
// registers, with helpers to interpret it for the debug dumps produced
// by print_CPU_state()/_verify_FPU().
class FPU_State {
 public:
  enum {
    register_size       = 10,   // bytes per 80-bit register slot
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  // Tag for stack-relative register ST(i): the physical register index
  // is (TOP + i) mod 8.
  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  // Raw bytes of the i-th register slot in the save image, viewed as an
  // FPU_Register.  NOTE(review): slots appear to be stored in stack
  // order (ST0 first) given how print() pairs st(j) with ST labels --
  // confirm against the save-state layout.
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  // Dump all eight registers (physical index, stack index, value, tag),
  // marking the top-of-stack register with '*', then the control,
  // status, and tag words.
  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        // j is the stack-relative index of physical register i
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};
9342 
// Debug-printing view of the EFLAGS/RFLAGS register as saved in the
// CPU-state image.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return (_value & (1 << 11)) != 0; }
  bool direction() const               { return (_value & (1 << 10)) != 0; }
  bool sign() const                    { return (_value & (1 <<  7)) != 0; }
  bool zero() const                    { return (_value & (1 <<  6)) != 0; }
  bool auxiliary_carry() const         { return (_value & (1 <<  4)) != 0; }
  bool parity() const                  { return (_value & (1 <<  2)) != 0; }
  bool carry() const                   { return (_value & (1 <<  0)) != 0; }

  // Prints "<hex>  flags = <ODSZAPC>": one letter per tracked flag bit,
  // '-' where the bit is clear.
  void print() const {
    char f[8];
    f[0] = overflow()        ? 'O' : '-';
    f[1] = direction()       ? 'D' : '-';
    f[2] = sign()            ? 'S' : '-';
    f[3] = zero()            ? 'Z' : '-';
    f[4] = auxiliary_carry() ? 'A' : '-';
    f[5] = parity()          ? 'P' : '-';
    f[6] = carry()           ? 'C' : '-';
    f[7] = '\0';
    printf("%08x  flags = %s", _value, f);
  }

};
9371 
// Debug-printing view of one saved general-purpose register value.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value in hex and (right-aligned) signed decimal.
  void print() const {
    printf("%08x  %11d", _value, _value);
  }

};
9381 
// Snapshot of the integer-unit state: EFLAGS plus the general-purpose
// registers.  NOTE(review): the field order presumably mirrors the
// memory layout produced by push_CPU_state (flags at the lowest
// address, then rdi..rax) -- confirm against push_CPU_state.
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  // Dump all registers, one per line, then the flags.  The stray commas
  // in the "rax,", "rbx," and "rbp," labels are long-standing quirks of
  // these debug strings; the output is kept as-is.
  void print() const {
    // computation registers
    printf("rax,  = "); _rax.print(); printf("\n");
    printf("rbx,  = "); _rbx.print(); printf("\n");
    printf("rcx  = "); _rcx.print(); printf("\n");
    printf("rdx  = "); _rdx.print(); printf("\n");
    printf("rdi  = "); _rdi.print(); printf("\n");
    printf("rsi  = "); _rsi.print(); printf("\n");
    printf("rbp,  = "); _rbp.print(); printf("\n");
    printf("rsp  = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};
9409 
9410 
// Complete CPU snapshot (FPU state followed by integer state) as laid
// out in memory for the debug helpers _print_CPU_state/_verify_FPU.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  // Dump both register files, framed by separator lines.
  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};
9425 
9426 
// Runtime C entry point invoked from generated code (see
// MacroAssembler::print_CPU_state below) with a pointer to the saved
// CPU state; simply dumps it to stdout.
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
9430 
9431 
// Emit code that dumps the full CPU state (FPU + integer registers) to
// stdout without disturbing it: save everything, call the C helper with
// a pointer to the saved image on the stack, then restore everything.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
9439 
9440 
9441 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
9442   static int counter = 0;
9443   FPU_State* fs = &state->_fpu_state;
9444   counter++;
9445   // For leaf calls, only verify that the top few elements remain empty.
9446   // We only need 1 empty at the top for C2 code.
9447   if( stack_depth < 0 ) {
9448     if( fs->tag_for_st(7) != 3 ) {
9449       printf("FPR7 not empty\n");
9450       state->print();
9451       assert(false, "error");
9452       return false;
9453     }
9454     return true;                // All other stack states do not matter
9455   }
9456 
9457   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
9458          "bad FPU control word");
9459 
9460   // compute stack depth
9461   int i = 0;
9462   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
9463   int d = i;
9464   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
9465   // verify findings
9466   if (i != FPU_State::number_of_registers) {
9467     // stack not contiguous
9468     printf("%s: stack not contiguous at ST%d\n", s, i);
9469     state->print();
9470     assert(false, "error");
9471     return false;
9472   }
9473   // check if computed stack depth corresponds to expected stack depth
9474   if (stack_depth < 0) {
9475     // expected stack depth is -stack_depth or less
9476     if (d > -stack_depth) {
9477       // too many elements on the stack
9478       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
9479       state->print();
9480       assert(false, "error");
9481       return false;
9482     }
9483   } else {
9484     // expected stack depth is stack_depth
9485     if (d != stack_depth) {
9486       // wrong stack depth
9487       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
9488       state->print();
9489       assert(false, "error");
9490       return false;
9491     }
9492   }
9493   // everything is cool
9494   return true;
9495 }
9496 
9497 
// Emit code that verifies the x87 FPU stack state via the _verify_FPU
// runtime helper (see above for the meaning of stack_depth); a failed
// check hits an int3 breakpoint.  No code is emitted unless VerifyFPU.
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error: _verify_FPU returns false (0) in rax on failure
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
9517 
// Load the klass pointer of the object in src into dst.  With
// compressed oops the klass field is a 32-bit narrow value that must be
// decoded (it is never NULL, hence decode_heap_oop_not_null).
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}
9527 
// Load the prototype header of src's klass into dst (klass load
// followed by a load of Klass::prototype_header).  With compressed oops
// the narrow-klass decode is fused into the addressing mode of the
// second load to save instructions.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert (Universe::heap() != NULL, "java heap should be initialized");
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_oop_shift() != 0) {
      assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      if (LogMinObjAlignmentInBytes == Address::times_8) {
        // Decode via scaled addressing: heapbase + narrow*8 + offset.
        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset()));
      } else {
        // OK to use shift since we don't need to preserve flags.
        shlq(dst, LogMinObjAlignmentInBytes);
        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset()));
      }
    } else {
      // No shift: the narrow value is already the address.
      movq(dst, Address(dst, Klass::prototype_header_offset()));
    }
  } else
#endif
  {
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    movptr(dst, Address(dst, Klass::prototype_header_offset()));
  }
}
9552 
// Store the klass pointer in src into the object header of dst.  With
// compressed oops src is encoded in place first (clobbering src).
void MacroAssembler::store_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    encode_heap_oop_not_null(src);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
9562 
// Load a (possibly NULL) heap oop from src into dst, decoding it when
// compressed oops are in use.
void MacroAssembler::load_heap_oop(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop(dst);
  } else
#endif
    movptr(dst, src);
}
9572 
// Load a known non-NULL heap oop from src into dst.
// Doesn't do verification, generates fixed size code (callers count the
// emitted instructions).
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, src);
}
9583 
// Store the oop in src to the heap location dst.  With compressed oops
// src is encoded in place (clobbering it), so the address dst must not
// be formed from src.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else
#endif
    movptr(dst, src);
}
9594 
// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
#ifdef _LP64
  if (UseCompressedOops) {
    // Narrow oop field: a 32-bit zero store suffices.
    movl(dst, (int32_t)NULL_WORD);
  } else {
    // Full-width field: store a sign-extended 32-bit zero.
    movslq(dst, (int32_t)NULL_WORD);
  }
#else
  movl(dst, (int32_t)NULL_WORD);
#endif
}
9607 
9608 #ifdef _LP64
// Store src into the 32-bit klass gap that follows a narrow klass field
// in the object header; no-op when klasses are full-width.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedOops) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
9615 
9616 #ifdef ASSERT
// Debug-only: emit a check that r12_heapbase still holds the current
// narrow oop base (compare against Universe::narrow_oop_base_addr);
// stops the VM with msg on mismatch.  Guarded by CheckCompressedOops.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
    jcc(Assembler::equal, ok);
    stop(msg);
    bind(ok);
    pop(rscratch1);
  }
}
9630 #endif
9631 
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Compress the (possibly NULL) oop in r in place:
// narrow = (oop - base) >> shift, with NULL mapping to 0.
// Clobbers the condition flags.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based mode: no base subtraction, only the alignment shift.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Heap-based mode: substitute the base for NULL so the subtraction
  // below yields 0 for a NULL oop, then subtract and shift.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
9650 
// Compress a known non-NULL oop in r in place; skips the NULL handling
// of encode_heap_oop (a debug-only check catches NULL callers).
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  // Subtract the base (if any), then apply the alignment shift (if any).
  if (Universe::narrow_oop_base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
9671 
// Two-register variant: compress the known non-NULL oop in src into
// dst, leaving src untouched when dst != src.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (Universe::narrow_oop_base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
9695 
// Expand the (possibly zero) narrow oop in r in place:
// oop = (narrow << shift) + base, with 0 decoding back to NULL.
void  MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based mode: shifting (if configured) fully decodes the oop.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    // shlq sets ZF from its result, so a zero narrow oop (NULL) skips
    // the base add and stays NULL.
    // NOTE(review): this assumes the shift count is non-zero whenever
    // the base is non-NULL -- a zero-count shift leaves flags unchanged.
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}
9714 
// Expand a known non-zero narrow oop in r in place; no NULL check, no
// oop verification (see notes below on why).
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
9732 
// Two-register variant: expand the known non-zero narrow oop in src
// into dst.  Uses a single lea when the shift matches the times_8
// addressing scale; otherwise falls back to move/shift/add.
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // Single instruction: dst = heapbase + src * 8.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
9760 
// Materialize obj as a narrow-oop immediate in dst, recording an oop
// relocation so the GC can patch the embedded value.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
9769 
// Store obj as a narrow-oop immediate to memory location dst, recording
// an oop relocation so the GC can patch the embedded value.
void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
9778 
// Compare the narrow oop in dst against obj as a relocated narrow-oop
// immediate.
void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
9787 
// Compare the narrow oop at memory location dst against obj as a
// relocated narrow-oop immediate.
void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
9796 
// (Re)load r12_heapbase with the current narrow oop base from
// Universe::narrow_oop_base_addr; no-op without compressed oops.
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
  }
}
9802 #endif // _LP64
9803 
9804 
// C2 compiled method's prolog code.
//   framesize   - total frame size in bytes (stack-aligned, including
//                 the return address word)
//   stack_bang  - whether to emit a stack-overflow bang before
//                 allocating the frame
//   fp_mode_24b - 32-bit only: switch the x87 FPU to 24-bit precision
void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang) {
    generate_stack_overflow_check(framesize);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check that rsp is StackAlignmentInBytes-aligned modulo the pushed
    // return address word.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    stop("Stack is not properly aligned!");
    bind(L);
  }
#endif

}
9876 
9877 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
// On exit, result holds the match index in chars, or -1 if not found.
void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                      Register cnt1, Register cnt2,
                                      int int_cnt2,  Register result,
                                      XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= 8, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  movdqu(vec, Address(str2, 0));
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > 8) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movdqu(vec, Address(str2, 0));
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, 2);

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  // CF==1 (below) means a partial match was found in this 8-char window.
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == 8) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == 8) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(EXIT);

  if (int_cnt2 > 8) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), 0x0d);
    // Reload only string if does not match
    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, 8);

    bind(SCAN_SUBSTR);
    subl(cnt1, 8);
    cmpl(cnt2, -8); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, 8);
    movl(cnt2, 8); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, 8);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // index
  bind(EXIT);

} // string_indexofC8
10029 
// Locate substring 'str2' (length in cnt2, or compile-time constant int_cnt2)
// inside string 'str1' (length in cnt1) using SSE4.2 pcmpestri, and leave the
// match index in 'result' (-1 if not found).
// Small strings are loaded through stack if they cross page boundary.
void MacroAssembler::string_indexof(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    int int_cnt2,  Register result,
                                    XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");
  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == 1) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (int_cnt2 == 2) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (int_cnt2 == 4) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 }
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*typeArrayKlass::header_size() >= 12,"sanity");
        movdqu(vec, Address(str2, (int_cnt2*2)-16));
        psrldq(vec, 16-(int_cnt2*2));
      }
    } else { // not constant substring
      cmpl(cnt2, 8);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-2;
      push(cnt2);

      // Copy substring chars onto the stack one char at a time (backwards).
      bind(COPY_SUBSTR);
      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, 8);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -2;
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy string chars onto the stack one char at a time (backwards).
    bind(COPY_STR);
    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      movdqu(vec, Address(str2, 0));
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    movdqu(vec, Address(str2, 0));
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    shrl(str1, 1);
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, 8); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, Address::times_2, -16));
  movl(cnt1, 8);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, 8-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, 8);
    jccb(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, 8);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), 0x0d);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, 8);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    addptr(str2, 16);
    subl(cnt1, 8);
    cmpl(cnt2, 8); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
    lea(str2, Address(str2, cnt2, Address::times_2, -16));
    lea(str1, Address(str1, cnt2, Address::times_2, -16));
    subl(cnt1, cnt2);
    movl(cnt2, 8);
    addl(cnt1, 8);
    bind(CONT_SCAN_SUBSTR);
    movdqu(vec, Address(str2, 0));
    jmpb(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // index

  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
10292 
// Compare strings str1 (length cnt1) and str2 (length cnt2) and set
// 'result' to a signed value: the difference of the first mismatched
// char pair, or, when one string is a prefix of the other, the length
// difference (cnt1 - cnt2). The length difference is kept on the stack.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    XMMRegister vec1) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;

  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result);

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  // Load first characters
  load_unsigned_short(result, Address(str1, 0));
  load_unsigned_short(cnt1, Address(str2, 0));

  // Compare first characters
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);
  decrementl(cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  {
    // Check after comparing first character to see if strings are equivalent
    Label LSkip2;
    // Check if the strings start at same location
    cmpptr(str1, str2);
    jccb(Assembler::notEqual, LSkip2);

    // Check if the length difference is zero (from stack)
    cmpl(Address(rsp, 0), 0x0);
    jcc(Assembler::equal,  LENGTH_DIFF_LABEL);

    // Strings might not be equivalent
    bind(LSkip2);
  }

  Address::ScaleFactor scale = Address::times_2;
  int stride = 8;

  // Advance to next element
  addptr(str1, 16/stride);
  addptr(str2, 16/stride);

  if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 16-byte vectors
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point str1/str2 past the vectorizable region and scan with a
    // negative index counting up towards zero.
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testl(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-scan the last full vector (overlapping the already-compared part)
    // to cover the remaining tail elements.
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(result, cnt1);
    movptr(cnt2, result);
    load_unsigned_short(result, Address(str1, cnt2, scale));
    load_unsigned_short(cnt1, Address(str2, cnt2, scale));
    subl(result, cnt1);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }

  // Shift str2 and str1 to the end of the arrays, negate min
  lea(str1, Address(str1, cnt2, scale, 0));
  lea(str2, Address(str2, cnt2, scale, 0));
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  jmpb(DONE_LABEL);

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
}
10429 
// Compare char[] arrays aligned to 4 bytes or substrings.
// Sets 'result' to 1 when the contents are equal, 0 otherwise.
// When is_array_equ is true, ary1/ary2 are array oops: null and length
// checks are emitted and the data base offset is applied; otherwise the
// registers already point at the chars and 'limit' holds the char count.
void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                        Register limit, Register result, Register chr,
                                        XMMRegister vec1, XMMRegister vec2) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  // Check the input args: same array is trivially equal.
  cmpptr(ary1, ary2);
  jcc(Assembler::equal, TRUE_LABEL);

  if (is_array_equ) {
    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  shll(limit, 1);      // byte count != 0
  movl(result, limit); // copy

  if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000e);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point past the vectorizable region and scan with a negative index.
    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    // XOR of equal vectors is zero; any set bit means a mismatch.
    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);

    // Compare the tail with one final (overlapping) 16-byte load.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, TRUE_LABEL);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
}
10543 
10544 #ifdef PRODUCT
10545 #define BLOCK_COMMENT(str) /* nothing */
10546 #else
10547 #define BLOCK_COMMENT(str) block_comment(str)
10548 #endif
10549 
10550 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Fill 'count' elements of type 't' starting at 'to' with 'value'.
// 'value' is replicated into all bytes/shorts of a 32-bit word so the
// same stores work for every element type; 'shift' is chosen so that
// (1 << shift) elements occupy 4 bytes (byte:2, short:1, int:0).
// 'aligned' indicates the caller guarantees 4-byte alignment of 'to'.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  Label L_fill_2_bytes, L_fill_4_bytes;

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate the low byte of 'value' into the low 16 bits.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  // Replicate the low 16 bits into the full 32-bit word.
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    // Eight 4-byte stores per iteration (no SSE2 available).
    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Fill 32-byte chunks
      // Broadcast the 32-bit fill word into all four dwords of xtmp.
      movdl(xtmp, value);
      pshufd(xtmp, xtmp, 0);

      subl(count, 8 << shift);
      jcc(Assembler::less, L_check_fill_8_bytes);
      align(16);

      BIND(L_fill_32_bytes_loop);

      if (UseUnalignedLoadStores) {
        movdqu(Address(to, 0), xtmp);
        movdqu(Address(to, 16), xtmp);
      } else {
        movq(Address(to, 0), xtmp);
        movq(Address(to, 8), xtmp);
        movq(Address(to, 16), xtmp);
        movq(Address(to, 24), xtmp);
      }

      addptr(to, 32);
      subl(count, 8 << shift);
      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
      BIND(L_check_fill_8_bytes);
      addl(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
10721 #undef BIND
10722 #undef BLOCK_COMMENT
10723 
10724 
10725 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10726   switch (cond) {
10727     // Note some conditions are synonyms for others
10728     case Assembler::zero:         return Assembler::notZero;
10729     case Assembler::notZero:      return Assembler::zero;
10730     case Assembler::less:         return Assembler::greaterEqual;
10731     case Assembler::lessEqual:    return Assembler::greater;
10732     case Assembler::greater:      return Assembler::lessEqual;
10733     case Assembler::greaterEqual: return Assembler::less;
10734     case Assembler::below:        return Assembler::aboveEqual;
10735     case Assembler::belowEqual:   return Assembler::above;
10736     case Assembler::above:        return Assembler::belowEqual;
10737     case Assembler::aboveEqual:   return Assembler::below;
10738     case Assembler::overflow:     return Assembler::noOverflow;
10739     case Assembler::noOverflow:   return Assembler::overflow;
10740     case Assembler::negative:     return Assembler::positive;
10741     case Assembler::positive:     return Assembler::negative;
10742     case Assembler::parity:       return Assembler::noParity;
10743     case Assembler::noParity:     return Assembler::parity;
10744   }
10745   ShouldNotReachHere(); return Assembler::overflow;
10746 }
10747 
10748 SkipIfEqual::SkipIfEqual(
10749     MacroAssembler* masm, const bool* flag_addr, bool value) {
10750   _masm = masm;
10751   _masm->cmp8(ExternalAddress((address)flag_addr), value);
10752   _masm->jcc(Assembler::equal, _label);
10753 }
10754 
// Bind the skip target so the jump emitted by the constructor lands here.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}