1 /*
   2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "assembler_x86.inline.hpp"
  27 #include "gc_interface/collectedHeap.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "memory/cardTableModRefBS.hpp"
  30 #include "memory/resourceArea.hpp"
  31 #include "prims/methodHandles.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/interfaceSupport.hpp"
  34 #include "runtime/objectMonitor.hpp"
  35 #include "runtime/os.hpp"
  36 #include "runtime/sharedRuntime.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #ifndef SERIALGC
  39 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  40 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  41 #include "gc_implementation/g1/heapRegion.hpp"
  42 #endif
  43 
  44 // Implementation of AddressLiteral
  45 
// Build an AddressLiteral for 'target', attaching the relocation record
// implied by 'rtype'.  The literal is not an lvalue by default; callers
// that want the address-of form set _is_lval separately.
AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  _is_lval = false;
  _target = target;
  switch (rtype) {
  case relocInfo::oop_type:
    // Oops are a special case. Normally they would be their own section
    // but in cases like icBuffer they are literals in the code stream that
    // we don't have a section for. We use none so that we get a literal address
    // which is always patchable.
    break;
  case relocInfo::external_word_type:
    _rspec = external_word_Relocation::spec(target);
    break;
  case relocInfo::internal_word_type:
    _rspec = internal_word_Relocation::spec(target);
    break;
  case relocInfo::opt_virtual_call_type:
    _rspec = opt_virtual_call_Relocation::spec();
    break;
  case relocInfo::static_call_type:
    _rspec = static_call_Relocation::spec();
    break;
  case relocInfo::runtime_call_type:
    _rspec = runtime_call_Relocation::spec();
    break;
  case relocInfo::poll_type:
  case relocInfo::poll_return_type:
    _rspec = Relocation::spec_simple(rtype);
    break;
  case relocInfo::none:
    // No relocation record; _rspec keeps its default-constructed (none) state.
    break;
  default:
    ShouldNotReachHere();
    break;
  }
}
  82 
  83 // Implementation of Address
  84 
  85 #ifdef _LP64
  86 
// 64-bit variant: array addressing via an absolute base literal is not
// representable, so this must never be reached.
Address Address::make_array(ArrayAddress adr) {
  // Not implementable on 64bit machines
  // Should have been handled higher up the call chain.
  ShouldNotReachHere();
  return Address();
}
  93 
  94 // exceedingly dangerous constructor
// Build an absolute-displacement Address (no base, no index) whose disp
// refers to 'loc' with the relocation implied by 'rtype'.  Dangerous
// because the caller must guarantee disp/loc consistency.
Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = disp;
  switch (rtype) {
    case relocInfo::external_word_type:
      _rspec = external_word_Relocation::spec(loc);
      break;
    case relocInfo::internal_word_type:
      _rspec = internal_word_Relocation::spec(loc);
      break;
    case relocInfo::runtime_call_type:
      // HMM
      _rspec = runtime_call_Relocation::spec();
      break;
    case relocInfo::poll_type:
    case relocInfo::poll_return_type:
      _rspec = Relocation::spec_simple(rtype);
      break;
    case relocInfo::none:
      break;
    default:
      ShouldNotReachHere();
  }
}
 121 #else // LP64
 122 
// 32-bit variant: combine the literal base address and the index Address
// into a single [index*scale + abs-base] Address carrying the base's
// relocation spec.
Address Address::make_array(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(index._base, index._index, index._scale, (intptr_t) base.target());
  array._rspec = base._rspec;
  return array;
}
 131 
 132 // exceedingly dangerous constructor
// Build an absolute Address from a raw code location plus an explicit
// relocation spec; the address itself becomes the 32-bit displacement.
Address::Address(address loc, RelocationHolder spec) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = (intptr_t) loc;
  _rspec = spec;
}
 140 
 141 #endif // _LP64
 142 
 143 
 144 
 145 // Convert the raw encoding form into the form expected by the constructor for
 146 // Address.  An index of 4 (rsp) corresponds to having no index, so convert
 147 // that to noreg for the Address constructor.
 148 Address Address::make_raw(int base, int index, int scale, int disp, bool disp_is_oop) {
 149   RelocationHolder rspec;
 150   if (disp_is_oop) {
 151     rspec = Relocation::spec_simple(relocInfo::oop_type);
 152   }
 153   bool valid_index = index != rsp->encoding();
 154   if (valid_index) {
 155     Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
 156     madr._rspec = rspec;
 157     return madr;
 158   } else {
 159     Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
 160     madr._rspec = rspec;
 161     return madr;
 162   }
 163 }
 164 
 165 // Implementation of Assembler
 166 
 167 int AbstractAssembler::code_fill_byte() {
 168   return (u_char)'\xF4'; // hlt
 169 }
 170 
 171 // make this go away someday
 172 void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
 173   if (rtype == relocInfo::none)
 174         emit_long(data);
 175   else  emit_data(data, Relocation::spec_simple(rtype), format);
 176 }
 177 
// Emit a 32-bit data word, recording 'rspec' against the enclosing
// instruction (inst_mark) rather than against the word itself.
void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  if (rspec.type() !=  relocInfo::none) {
    #ifdef ASSERT
      check_relocation(rspec, format);
    #endif
    // Do not use AbstractAssembler::relocate, which is not intended for
    // embedded words.  Instead, relocate to the enclosing instruction.

    // hack. call32 is too wide for mask so use disp32
    if (format == call32_operand)
      code_section()->relocate(inst_mark(), rspec, disp32_operand);
    else
      code_section()->relocate(inst_mark(), rspec, format);
  }
  emit_long(data);
}
 196 
 197 static int encode(Register r) {
 198   int enc = r->encoding();
 199   if (enc >= 8) {
 200     enc -= 8;
 201   }
 202   return enc;
 203 }
 204 
 205 static int encode(XMMRegister r) {
 206   int enc = r->encoding();
 207   if (enc >= 8) {
 208     enc -= 8;
 209   }
 210   return enc;
 211 }
 212 
// Emit a byte-sized arithmetic op: opcode, then a ModRM byte formed from
// op2 plus dst's encoding, then the 8-bit immediate.
void Assembler::emit_arith_b(int op1, int op2, Register dst, int imm8) {
  assert(dst->has_byte_register(), "must have byte register");
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert(isByte(imm8), "not a byte");
  assert((op1 & 0x01) == 0, "should be 8bit operation"); // opcode bit 0 clear => byte form
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_byte(imm8);
}
 222 
 223 
// Emit a 32-bit register/immediate arithmetic op, preferring the short
// sign-extended imm8 encoding when the immediate fits in 8 bits.
void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_byte(op2 | encode(dst));
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_byte(op2 | encode(dst));
    emit_long(imm32);
  }
}
 238 
 239 // Force generation of a 4 byte immediate value even if it fits into 8bit
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (used when the immediate field must later be patched in place).
void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_long(imm32);
}
 248 
 249 // immediate-to-memory forms
 250 void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
 251   assert((op1 & 0x01) == 1, "should be 32bit operation");
 252   assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
 253   if (is8bit(imm32)) {
 254     emit_byte(op1 | 0x02); // set sign bit
 255     emit_operand(rm, adr, 1);
 256     emit_byte(imm32 & 0xFF);
 257   } else {
 258     emit_byte(op1);
 259     emit_operand(rm, adr, 4);
 260     emit_long(imm32);
 261   }
 262 }
 263 
// 32-bit-only form: arithmetic op against an oop embedded as a literal
// immediate, recorded with an oop relocation so the GC can patch it.
void Assembler::emit_arith(int op1, int op2, Register dst, jobject obj) {
  LP64_ONLY(ShouldNotReachHere());  // 64-bit never embeds oops this way
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  InstructionMark im(this);
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_data((intptr_t)obj, relocInfo::oop_type, 0);
}
 274 
 275 
// Register-register arithmetic: ModRM with dst in the reg field (bits 5:3)
// and src in the r/m field (bits 2:0).
void Assembler::emit_arith(int op1, int op2, Register dst, Register src) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  emit_byte(op1);
  emit_byte(op2 | encode(dst) << 3 | encode(src));
}
 281 
 282 
// Emit the ModRM byte (and SIB byte if required) plus any displacement for
// a memory operand [base + index*scale + disp], with 'reg' placed in the
// ModRM reg field.  'rip_relative_correction' accounts for bytes (such as a
// trailing immediate) emitted after the displacement when computing a
// RIP-relative offset on 64-bit.
void Assembler::emit_operand(Register reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec,
                             int rip_relative_correction) {
  relocInfo::relocType rtype = (relocInfo::relocType) rspec.type();

  // Encode the registers as needed in the fields they are used in

  int regenc = encode(reg) << 3;
  int indexenc = index->is_valid() ? encode(index) << 3 : 0;
  int baseenc = base->is_valid() ? encode(base) : 0;

  if (base->is_valid()) {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [base + index*scale + disp]
      // rbp/r13 as base cannot use the no-displacement form (that encoding
      // means disp32-only), hence the extra base check below.
      if (disp == 0 && rtype == relocInfo::none  &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base + index*scale]
        // [00 reg 100][ss index base]
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x04 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + index*scale + imm8]
        // [01 reg 100][ss index base] imm8
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x44 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + index*scale + disp32]
        // [10 reg 100][ss index base] disp32
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x84 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    } else if (base == rsp LP64_ONLY(|| base == r12)) {
      // [rsp + disp]
      // rsp/r12 as base always needs a SIB byte (base encoding 100 selects SIB).
      if (disp == 0 && rtype == relocInfo::none) {
        // [rsp]
        // [00 reg 100][00 100 100]
        emit_byte(0x04 | regenc);
        emit_byte(0x24);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [rsp + imm8]
        // [01 reg 100][00 100 100] disp8
        emit_byte(0x44 | regenc);
        emit_byte(0x24);
        emit_byte(disp & 0xFF);
      } else {
        // [rsp + imm32]
        // [10 reg 100][00 100 100] disp32
        emit_byte(0x84 | regenc);
        emit_byte(0x24);
        emit_data(disp, rspec, disp32_operand);
      }
    } else {
      // [base + disp]
      assert(base != rsp LP64_ONLY(&& base != r12), "illegal addressing mode");
      if (disp == 0 && rtype == relocInfo::none &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base]
        // [00 reg base]
        emit_byte(0x00 | regenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + disp8]
        // [01 reg base] disp8
        emit_byte(0x40 | regenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + disp32]
        // [10 reg base] disp32
        emit_byte(0x80 | regenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    }
  } else {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [index*scale + disp]
      // [00 reg 100][ss index 101] disp32
      assert(index != rsp, "illegal addressing mode");
      emit_byte(0x04 | regenc);
      emit_byte(scale << 6 | indexenc | 0x05);
      emit_data(disp, rspec, disp32_operand);
    } else if (rtype != relocInfo::none ) {
      // [disp] (64bit) RIP-RELATIVE (32bit) abs
      // [00 000 101] disp32

      emit_byte(0x05 | regenc);
      // Note that the RIP-rel. correction applies to the generated
      // disp field, but _not_ to the target address in the rspec.

      // disp was created by converting the target address minus the pc
      // at the start of the instruction. That needs more correction here.
      // intptr_t disp = target - next_ip;
      assert(inst_mark() != NULL, "must be inside InstructionMark");
      address next_ip = pc() + sizeof(int32_t) + rip_relative_correction;
      int64_t adjusted = disp;
      // Do rip-rel adjustment for 64bit
      LP64_ONLY(adjusted -=  (next_ip - inst_mark()));
      assert(is_simm32(adjusted),
             "must be 32bit offset (RIP relative address)");
      emit_data((int32_t) adjusted, rspec, disp32_operand);

    } else {
      // 32bit never did this, did everything as the rip-rel/disp code above
      // [disp] ABSOLUTE
      // [00 reg 100][00 100 101] disp32
      emit_byte(0x04 | regenc);
      emit_byte(0x25);
      emit_data(disp, rspec, disp32_operand);
    }
  }
}
 400 
// XMM variant: XMM encodings share the ModRM reg field layout, so reuse
// the general-purpose emit_operand after a register-type cast.
void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec) {
  emit_operand((Register)reg, base, index, scale, disp, rspec);
}
 406 
 407 // Secret local extension to Assembler::WhichOperand:
 408 #define end_pc_operand (_WhichOperand_limit)
 409 
// Instruction decoder used by relocation machinery: walks one x86
// instruction starting at 'inst' and returns the address of the requested
// 32/64-bit operand (or, for end_pc_operand, the end of the instruction).
address Assembler::locate_operand(address inst, WhichOperand which) {
  // Decode the given instruction, and return the address of
  // an embedded 32-bit operand word.

  // If "which" is disp32_operand, selects the displacement portion
  // of an effective address specifier.
  // If "which" is imm64_operand, selects the trailing immediate constant.
  // If "which" is call32_operand, selects the displacement of a call or jump.
  // Caller is responsible for ensuring that there is such an operand,
  // and that it is 32/64 bits wide.

  // If "which" is end_pc_operand, find the end of the instruction.

  address ip = inst;
  bool is_64bit = false;

  debug_only(bool has_disp32 = false);
  int tail_size = 0; // other random bytes (#32, #16, etc.) at end of insn

  again_after_prefix:
  switch (0xFF & *ip++) {

  // These convenience macros generate groups of "case" labels for the switch.
#define REP4(x) (x)+0: case (x)+1: case (x)+2: case (x)+3
#define REP8(x) (x)+0: case (x)+1: case (x)+2: case (x)+3: \
             case (x)+4: case (x)+5: case (x)+6: case (x)+7
#define REP16(x) REP8((x)+0): \
              case REP8((x)+8)

  case CS_segment:
  case SS_segment:
  case DS_segment:
  case ES_segment:
  case FS_segment:
  case GS_segment:
    // Seems dubious
    LP64_ONLY(assert(false, "shouldn't have that prefix"));
    assert(ip == inst+1, "only one prefix allowed");
    goto again_after_prefix;

  case 0x67:
  case REX:
  case REX_B:
  case REX_X:
  case REX_XB:
  case REX_R:
  case REX_RB:
  case REX_RX:
  case REX_RXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    goto again_after_prefix;

  case REX_W:
  case REX_WB:
  case REX_WX:
  case REX_WXB:
  case REX_WR:
  case REX_WRB:
  case REX_WRX:
  case REX_WRXB:
    // REX.W set: operands (and any trailing immediate) are 64-bit wide.
    NOT_LP64(assert(false, "64bit prefixes"));
    is_64bit = true;
    goto again_after_prefix;

  case 0xFF: // pushq a; decl a; incl a; call a; jmp a
  case 0x88: // movb a, r
  case 0x89: // movl a, r
  case 0x8A: // movb r, a
  case 0x8B: // movl r, a
  case 0x8F: // popl a
    debug_only(has_disp32 = true);
    break;

  case 0x68: // pushq #32
    if (which == end_pc_operand) {
      return ip + 4;
    }
    assert(which == imm_operand && !is_64bit, "pushl has no disp32 or 64bit immediate");
    return ip;                  // not produced by emit_operand

  case 0x66: // movw ... (size prefix)
    again_after_size_prefix2:
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "64bit prefix found"));
      goto again_after_size_prefix2;
    case 0x8B: // movw r, a
    case 0x89: // movw a, r
      debug_only(has_disp32 = true);
      break;
    case 0xC7: // movw a, #16
      debug_only(has_disp32 = true);
      tail_size = 2;  // the imm16
      break;
    case 0x0F: // several SSE/SSE2 variants
      ip--;    // reparse the 0x0F
      goto again_after_prefix;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP8(0xB8): // movl/q r, #32/#64(oop?)
    if (which == end_pc_operand)  return ip + (is_64bit ? 8 : 4);
    // these asserts are somewhat nonsensical
#ifndef _LP64
    assert(which == imm_operand || which == disp32_operand,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#else
    assert((which == call32_operand || which == imm_operand) && is_64bit ||
           which == narrow_oop_operand && !is_64bit,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#endif // _LP64
    return ip;

  case 0x69: // imul r, a, #32
  case 0xC7: // movl a, #32(oop?)
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x0F: // movx..., etc.
    switch (0xFF & *ip++) {
    case 0x3A: // pcmpestri
      tail_size = 1;
      // intentional fall-through: 0x3A shares the 3-byte-opcode handling below
    case 0x38: // ptest, pmovzxbw
      ip++; // skip opcode
      debug_only(has_disp32 = true); // has both kinds of operands!
      break;

    case 0x70: // pshufd r, r/a, #8
      debug_only(has_disp32 = true); // has both kinds of operands!
      // intentional fall-through: both opcodes carry a trailing imm8
    case 0x73: // psrldq r, #8
      tail_size = 1;
      break;

    case 0x12: // movlps
    case 0x28: // movaps
    case 0x2E: // ucomiss
    case 0x2F: // comiss
    case 0x54: // andps
    case 0x55: // andnps
    case 0x56: // orps
    case 0x57: // xorps
    case 0x6E: // movd
    case 0x7E: // movd
    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
      debug_only(has_disp32 = true);
      break;

    case 0xAD: // shrd r, a, %cl
    case 0xAF: // imul r, a
    case 0xBE: // movsbl r, a (movsxb)
    case 0xBF: // movswl r, a (movsxw)
    case 0xB6: // movzbl r, a (movzxb)
    case 0xB7: // movzwl r, a (movzxw)
    case REP16(0x40): // cmovl cc, r, a
    case 0xB0: // cmpxchgb
    case 0xB1: // cmpxchg
    case 0xC1: // xaddl
    case 0xC7: // cmpxchg8
    case REP16(0x90): // setcc a
      debug_only(has_disp32 = true);
      // fall out of the switch to decode the address
      break;

    case 0xC4: // pinsrw r, a, #8
      debug_only(has_disp32 = true);
      // intentional fall-through: both opcodes carry a trailing imm8
    case 0xC5: // pextrw r, r, #8
      tail_size = 1;  // the imm8
      break;

    case 0xAC: // shrd r, a, #8
      debug_only(has_disp32 = true);
      tail_size = 1;  // the imm8
      break;

    case REP16(0x80): // jcc rdisp32
      if (which == end_pc_operand)  return ip + 4;
      assert(which == call32_operand, "jcc has no disp32 or imm");
      return ip;
    default:
      ShouldNotReachHere();
    }
    break;

  case 0x81: // addl a, #32; addl r, #32
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    // on 32bit in the case of cmpl, the imm might be an oop
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x83: // addl a, #8; addl r, #8
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1;
    break;

  case 0x9B:
    switch (0xFF & *ip++) {
    case 0xD9: // fnstcw a
      debug_only(has_disp32 = true);
      break;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP4(0x00): // addb a, r; addl a, r; addb r, a; addl r, a
  case REP4(0x10): // adc...
  case REP4(0x20): // and...
  case REP4(0x30): // xor...
  case REP4(0x08): // or...
  case REP4(0x18): // sbb...
  case REP4(0x28): // sub...
  case 0xF7: // mull a
  case 0x8D: // lea r, a
  case 0x87: // xchg r, a
  case REP4(0x38): // cmp...
  case 0x85: // test r, a
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
  case 0xC6: // movb a, #8
  case 0x80: // cmpb a, #8
  case 0x6B: // imul r, a, #8
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1; // the imm8
    break;

  case 0xC4: // VEX_3bytes
  case 0xC5: // VEX_2bytes
    assert((UseAVX > 0), "shouldn't have VEX prefix");
    assert(ip == inst+1, "no prefixes allowed");
    // C4 and C5 are also used as opcodes for PINSRW and PEXTRW instructions
    // but they have prefix 0x0F and processed when 0x0F processed above.
    //
    // In 32-bit mode the VEX first byte C4 and C5 alias onto LDS and LES
    // instructions (these instructions are not supported in 64-bit mode).
    // To distinguish them bits [7:6] are set in the VEX second byte since
    // ModRM byte can not be of the form 11xxxxxx in 32-bit mode. To set
    // those VEX bits REX and vvvv bits are inverted.
    //
    // Fortunately C2 doesn't generate these instructions so we don't need
    // to check for them in product version.

    // Check second byte
    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));

    // First byte
    if ((0xFF & *inst) == VEX_3bytes) {
      ip++; // third byte
      is_64bit = ((VEX_W & *ip) == VEX_W);
    }
    ip++; // opcode
    // To find the end of instruction (which == end_pc_operand).
    switch (0xFF & *ip) {
    case 0x61: // pcmpestri r, r/a, #8
    case 0x70: // pshufd r, r/a, #8
    case 0x73: // psrldq r, #8
      tail_size = 1;  // the imm8
      break;
    default:
      break;
    }
    ip++; // skip opcode
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
  case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
  case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
  case 0xDD: // fld_d a; fst_d a; fstp_d a
  case 0xDB: // fild_s a; fistp_s a; fld_x a; fstp_x a
  case 0xDF: // fild_d a; fistp_d a
  case 0xD8: // fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a
  case 0xDC: // fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a
  case 0xDE: // faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a
    debug_only(has_disp32 = true);
    break;

  case 0xE8: // call rdisp32
  case 0xE9: // jmp  rdisp32
    if (which == end_pc_operand)  return ip + 4;
    assert(which == call32_operand, "call has no disp32 or imm");
    return ip;

  case 0xF0:                    // Lock
    assert(os::is_MP(), "only on MP");
    goto again_after_prefix;

  case 0xF3:                    // For SSE
  case 0xF2:                    // For SSE2
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "found 64bit prefix"));
      ip++;
      // intentional fall-through: REX case skips the prefix byte above and
      // the opcode byte in the default case below
    default:
      ip++;
    }
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  default:
    ShouldNotReachHere();

#undef REP8
#undef REP16
  }

  assert(which != call32_operand, "instruction is not a call, jmp, or jcc");
#ifdef _LP64
  assert(which != imm_operand, "instruction is not a movq reg, imm64");
#else
  // assert(which != imm_operand || has_imm32, "instruction has no imm32 field");
  assert(which != imm_operand || has_disp32, "instruction has no imm32 field");
#endif // LP64
  assert(which != disp32_operand || has_disp32, "instruction has no disp32 field");

  // parse the output of emit_operand
  int op2 = 0xFF & *ip++;
  int base = op2 & 0x07;
  int op3 = -1;
  const int b100 = 4;
  const int b101 = 5;
  if (base == b100 && (op2 >> 6) != 3) {
    // ModRM r/m field 100 with mod != 11 means a SIB byte follows.
    op3 = 0xFF & *ip++;
    base = op3 & 0x07;   // refetch the base
  }
  // now ip points at the disp (if any)

  switch (op2 >> 6) {
  case 0:
    // [00 reg  100][ss index base]
    // [00 reg  100][00   100  esp]
    // [00 reg base]
    // [00 reg  100][ss index  101][disp32]
    // [00 reg  101]               [disp32]

    if (base == b101) {
      if (which == disp32_operand)
        return ip;              // caller wants the disp32
      ip += 4;                  // skip the disp32
    }
    break;

  case 1:
    // [01 reg  100][ss index base][disp8]
    // [01 reg  100][00   100  esp][disp8]
    // [01 reg base]               [disp8]
    ip += 1;                    // skip the disp8
    break;

  case 2:
    // [10 reg  100][ss index base][disp32]
    // [10 reg  100][00   100  esp][disp32]
    // [10 reg base]               [disp32]
    if (which == disp32_operand)
      return ip;                // caller wants the disp32
    ip += 4;                    // skip the disp32
    break;

  case 3:
    // [11 reg base]  (not a memory addressing mode)
    break;
  }

  if (which == end_pc_operand) {
    return ip + tail_size;
  }

#ifdef _LP64
  assert(which == narrow_oop_operand && !is_64bit, "instruction is not a movl adr, imm32");
#else
  assert(which == imm_operand, "instruction has only an imm field");
#endif // LP64
  return ip;
}
 820 
// Return the address of the instruction following 'inst' by asking
// locate_operand for the end-of-instruction pseudo-operand.
address Assembler::locate_next_instruction(address inst) {
  // Secretly share code with locate_operand:
  return locate_operand(inst, end_pc_operand);
}
 825 
 826 
 827 #ifdef ASSERT
// Debug-only sanity check: verify that the relocation about to be recorded
// actually points at the operand position locate_operand finds, i.e. the
// current pc().
void Assembler::check_relocation(RelocationHolder const& rspec, int format) {
  address inst = inst_mark();
  assert(inst != NULL && inst < pc(), "must point to beginning of instruction");
  address opnd;

  Relocation* r = rspec.reloc();
  if (r->type() == relocInfo::none) {
    return;  // nothing to check for a non-relocation
  } else if (r->is_call() || format == call32_operand) {
    // assert(format == imm32_operand, "cannot specify a nonzero format");
    opnd = locate_operand(inst, call32_operand);
  } else if (r->is_data()) {
    assert(format == imm_operand || format == disp32_operand
           LP64_ONLY(|| format == narrow_oop_operand), "format ok");
    opnd = locate_operand(inst, (WhichOperand)format);
  } else {
    assert(format == imm_operand, "cannot specify a format");
    return;
  }
  assert(opnd == pc(), "must put operand where relocs can find it");
}
 849 #endif // ASSERT
 850 
// 32-bit-safe operand emission: only legacy (non-REX) registers allowed.
void Assembler::emit_operand32(Register reg, Address adr) {
  assert(reg->encoding() < 8, "no extended registers");
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 857 
// Convenience overload: unpack an Address into the full emit_operand form.
void Assembler::emit_operand(Register reg, Address adr,
                             int rip_relative_correction) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec,
               rip_relative_correction);
}
 864 
// XMM convenience overload: unpack an Address into the XMM emit_operand form.
void Assembler::emit_operand(XMMRegister reg, Address adr) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 869 
 870 // MMX operations
// MMX operations: MMX registers never carry REX extensions, so the
// general-purpose path is reused via a register-type cast.
void Assembler::emit_operand(MMXRegister reg, Address adr) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}
 875 
 876 // work around gcc (3.2.1-7a) bug
 877 void Assembler::emit_operand(Address adr, MMXRegister reg) {
 878   assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
 879   emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
 880 }
 881 
 882 
// Emit a two-byte x87 instruction: opcode byte b1, then b2 + i where
// i selects FPU stack slot st(i), 0 <= i < 8.
void Assembler::emit_farith(int b1, int b2, int i) {
  assert(isByte(b1) && isByte(b2), "wrong opcode");
  assert(0 <= i &&  i < 8, "illegal stack offset");
  emit_byte(b1);
  emit_byte(b2 + i);
}
 889 
 890 
 891 // Now the Assembler instructions (identical for 32/64 bits)
 892 
// Add-with-carry, 32-bit operand size.  The register argument passed to
// emit_arith_operand / emit_operand in the immediate forms is the /r
// extension field of the opcode, not a real operand register.
void Assembler::adcl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rdx, dst, imm32);  // /2 = adc
}

void Assembler::adcl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x11);  // ADC r/m32, r32
  emit_operand(src, dst);
}

void Assembler::adcl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD0, dst, imm32);
}

void Assembler::adcl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x13);  // ADC r32, r/m32
  emit_operand(dst, src);
}

void Assembler::adcl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x13, 0xC0, dst, src);
}
 922 
// Integer add, 32-bit operand size (same overload pattern as adcl above).
void Assembler::addl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rax, dst, imm32);  // /0 = add
}

void Assembler::addl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x01);  // ADD r/m32, r32
  emit_operand(src, dst);
}

void Assembler::addl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC0, dst, imm32);
}

void Assembler::addl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x03);  // ADD r32, r/m32
  emit_operand(dst, src);
}

void Assembler::addl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
 952 
// Multi-byte NOPs encoded as 0F 1F /0 with a dummy memory operand.
// Each variant emits exactly the number of bytes in its name, which lets
// callers pad code to a precise alignment with a single instruction.
void Assembler::addr_nop_4() {
  assert(UseAddressNop, "no CPU support");
  // 4 bytes: NOP DWORD PTR [EAX+0]
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x40); // emit_rm(cbuf, 0x1, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_5() {
  assert(UseAddressNop, "no CPU support");
  // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x44); // emit_rm(cbuf, 0x1, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_7() {
  assert(UseAddressNop, "no CPU support");
  // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x80); // emit_rm(cbuf, 0x2, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}

void Assembler::addr_nop_8() {
  assert(UseAddressNop, "no CPU support");
  // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x84); // emit_rm(cbuf, 0x2, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}
 990 
// Scalar FP add: addsd (double, F2 prefix) and addss (single, F3 prefix),
// opcode 0x58.  The simd_prefix helpers emit legacy or VEX encodings.
void Assembler::addsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

void Assembler::addsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_operand(dst, src);
}

void Assembler::addss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

void Assembler::addss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_operand(dst, src);
}
1020 
// Bitwise AND, 32-bit operand size.
// NOTE(review): unlike adcl/addl, the Address+immediate form emits 0x81
// directly instead of going through emit_arith_operand, so it never uses
// the sign-extended imm8 short form — confirm whether any caller relies
// on the fixed instruction length before changing.
void Assembler::andl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);  // /4 = and; 4 = size of trailing imm32
  emit_long(imm32);
}

void Assembler::andl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE0, dst, imm32);
}

void Assembler::andl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x23);  // AND r32, r/m32
  emit_operand(dst, src);
}

void Assembler::andl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x23, 0xC0, dst, src);
}
1045 
// Packed bitwise AND: andpd (66 prefix) and andps (no prefix), opcode 0x54.
// Typically used to mask FP sign bits (e.g. for abs()).
void Assembler::andpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x54);
  emit_operand(dst, src);
}

void Assembler::andpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x54);
  emit_byte(0xC0 | encode);
}

void Assembler::andps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x54);
  emit_operand(dst, src);
}

void Assembler::andps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x54);
  emit_byte(0xC0 | encode);
}
1075 
// Bit scan forward: index of lowest set bit in src.
void Assembler::bsfl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

// Bit scan reverse: index of highest set bit in src.  On CPUs with LZCNT,
// an F3-prefixed BSR decodes as LZCNT, so BSR must not be emitted there.
void Assembler::bsrl(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// Byte-swap the 32-bit register (endianness conversion).
void Assembler::bswapl(Register reg) { // bswap
  int encode = prefix_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}
1096 
// Near relative call to a label.  A bound label emits the final backward
// displacement; an unbound one records a patch site and emits a zero
// displacement to be fixed up when the label binds.
void Assembler::call(Label& L, relocInfo::relocType rtype) {
  // suspect disp32 is always good
  int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);

  if (L.is_bound()) {
    const int long_size = 5;  // E8 + 4-byte displacement
    int offs = (int)( target(L) - pc() );
    assert(offs <= 0, "assembler error");  // bound labels are behind us
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    emit_byte(0xE8);
    emit_data(offs - long_size, rtype, operand);
  } else {
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    L.add_patch_at(code(), locator());

    emit_byte(0xE8);
    emit_data(int(0), rtype, operand);
  }
}
1118 
// Indirect call through a register: FF /2 with a register operand.
void Assembler::call(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xD0 | encode);
}


// Indirect call through memory: FF /2 (rdx supplies the /2 extension).
void Assembler::call(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rdx, adr);
}
1132 
// Direct call to an absolute address, emitted as E8 with a 32-bit
// PC-relative displacement; the displacement must fit in 32 bits.
void Assembler::call_literal(address entry, RelocationHolder const& rspec) {
  assert(entry != NULL, "call most probably wrong");
  InstructionMark im(this);
  emit_byte(0xE8);
  // Displacement is relative to the end of the instruction.
  intptr_t disp = entry - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (call2)");
  // Technically, should use call32_operand, but this format is
  // implied by the fact that we're emitting a call instruction.

  int operand = LP64_ONLY(disp32_operand) NOT_LP64(call32_operand);
  emit_data((int) disp, rspec, operand);
}

// Sign-extend EAX into EDX:EAX (CDQ).
void Assembler::cdql() {
  emit_byte(0x99);
}
1149 
// Conditional move: 0F 4x where x is the condition code.
void Assembler::cmovl(Condition cc, Register dst, Register src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}


// NOTE(review): unlike most Address-operand emitters in this file there is
// no InstructionMark here, so the operand is not relocatable — confirm no
// caller passes an Address carrying relocation info.
void Assembler::cmovl(Condition cc, Register dst, Address src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
1166 
// Compare memory with immediate, byte operand size (80 /7 ib).
void Assembler::cmpb(Address dst, int imm8) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x80);
  emit_operand(rdi, dst, 1);  // /7 = cmp; 1 = size of trailing imm8
  emit_byte(imm8);
}

// Compare memory with immediate, 32-bit operand size (81 /7 id).
void Assembler::cmpl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);  // /7 = cmp; 4 = size of trailing imm32
  emit_long(imm32);
}

void Assembler::cmpl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF8, dst, imm32);
}

void Assembler::cmpl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}


void Assembler::cmpl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x3B);  // CMP r32, r/m32
  emit_operand(dst, src);
}

// Compare memory with immediate, 16-bit operand size (66 81 /7 iw).
void Assembler::cmpw(Address dst, int imm16) {
  InstructionMark im(this);
  assert(!dst.base_needs_rex() && !dst.index_needs_rex(), "no extended registers");
  emit_byte(0x66);  // operand-size override to 16 bits
  emit_byte(0x81);
  emit_operand(rdi, dst, 2);  // /7 = cmp; 2 = size of trailing imm16
  emit_word(imm16);
}
1209 
// The 32-bit cmpxchg compares the value at adr with the contents of rax,
// and stores reg into adr if so; otherwise, the value at adr is loaded into rax,.
// The ZF is set if the compared values were equal, and cleared otherwise.
// Note: the emitted instruction is not made atomic here; callers prepend
// lock() as needed.
void Assembler::cmpxchgl(Register reg, Address adr) { // cmpxchg
  if (Atomics & 2) {
     // caveat: no instructionmark, so this isn't relocatable.
     // Emit a synthetic, non-atomic, CAS equivalent.
     // Beware.  The synthetic form sets all ICCs, not just ZF.
     // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r)
     cmpl(rax, adr);
     movl(rax, adr);
     if (reg != rax) {
        Label L ;
        jcc(Assembler::notEqual, L);
        movl(adr, reg);
        bind(L);
     }
  } else {
     // Real hardware CMPXCHG: 0F B1 /r.
     InstructionMark im(this);
     prefix(adr, reg);
     emit_byte(0x0F);
     emit_byte(0xB1);
     emit_operand(reg, adr);
  }
}
1235 
// Ordered scalar FP compare, setting EFLAGS: comisd (double, 66 prefix)
// and comiss (single, no prefix), opcode 0x2F.
void Assembler::comisd(XMMRegister dst, Address src) {
  // NOTE: dbx seems to decode this as comiss even though the
  // 0x66 is there. Strangely ucomisd comes out correct
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x2F);
  emit_operand(dst, src);
}

void Assembler::comisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x2F);
  emit_byte(0xC0 | encode);
}

void Assembler::comiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2F);
  emit_operand(dst, src);
}

void Assembler::comiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2F);
  emit_byte(0xC0 | encode);
}
1267 
// Scalar/packed FP conversion instructions.  Naming follows the hardware
// mnemonics; the trailing 'l' marks the 32-bit integer (dword) forms.
void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0xE6);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x5B);
  emit_byte(0xC0 | encode);
}

// double -> float
void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5A);
  emit_operand(dst, src);
}

// int32 -> double
void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// int32 -> float
void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// float -> double
void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtss2sd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5A);
  emit_operand(dst, src);
}
1341 
1342 
// Truncating scalar FP -> int32 conversions (round toward zero).
void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

void Assembler::cvttss2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
1356 
// Decrement memory operand: FF /1 (rcx supplies the /1 extension).
void Assembler::decl(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrement() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}
1364 
// Scalar FP divide: divsd (double, F2 prefix) and divss (single, F3
// prefix), opcode 0x5E.
void Assembler::divsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

void Assembler::divsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

void Assembler::divss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

void Assembler::divss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}
1394 
// Empty MMX state (EMMS): makes the x87 stack usable again after MMX use.
void Assembler::emms() {
  NOT_LP64(assert(VM_Version::supports_mmx(), ""));
  emit_byte(0x0F);
  emit_byte(0x77);
}

// Halt the processor (privileged; faults in user mode — used as a trap).
void Assembler::hlt() {
  emit_byte(0xF4);
}
1404 
// Signed divide EDX:EAX by src: F7 /7.
void Assembler::idivl(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}

// Unsigned divide EDX:EAX by src: F7 /6.
void Assembler::divl(Register src) { // Unsigned
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF0 | encode);
}
1416 
// Two-operand signed multiply: 0F AF /r.
void Assembler::imull(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}


// Three-operand signed multiply by immediate: dst = src * value.
// Uses the short imm8 form (6B) when the value fits in a signed byte,
// otherwise the imm32 form (69).
void Assembler::imull(Register dst, Register src, int value) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}
1437 
// Increment memory operand: FF /0 (rax supplies the /0 extension).
void Assembler::incl(Address dst) {
  // Don't use it directly. Use MacroAssembler::increment() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}
1445 
// Conditional jump to a label.  Bound labels get a short (2-byte) form when
// the displacement fits in 8 bits and maybe_short allows it, else the long
// (6-byte) form.  Unbound labels always get the long form plus a patch
// record, since the final distance is unknown.
void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
  InstructionMark im(this);
  assert((0 <= cc) && (cc < 16), "illegal cc");
  if (L.is_bound()) {
    address dst = target(L);
    assert(dst != NULL, "jcc most probably wrong");

    const int short_size = 2;
    const int long_size = 6;
    intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      assert(is_simm32(offs - long_size),
             "must be 32bit offset (call4)");
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
    // Note: could eliminate cond. jumps to this jump if condition
    //       is the same however, seems to be rather unlikely case.
    // Note: use jccb() if label to be bound is very close to get
    //       an 8-bit displacement
    L.add_patch_at(code(), locator());
    emit_byte(0x0F);
    emit_byte(0x80 | cc);
    emit_long(0);
  }
}
1479 
1480 void Assembler::jccb(Condition cc, Label& L) {
1481   if (L.is_bound()) {
1482     const int short_size = 2;
1483     address entry = target(L);
1484 #ifdef ASSERT
1485     intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
1486     intptr_t delta = short_branch_delta();
1487     if (delta != 0) {
1488       dist += (dist < 0 ? (-delta) :delta);
1489     }
1490     assert(is8bit(dist), "Dispacement too large for a short jmp");
1491 #endif
1492     intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
1493     // 0111 tttn #8-bit disp
1494     emit_byte(0x70 | cc);
1495     emit_byte((offs - short_size) & 0xFF);
1496   } else {
1497     InstructionMark im(this);
1498     L.add_patch_at(code(), locator());
1499     emit_byte(0x70 | cc);
1500     emit_byte(0);
1501   }
1502 }
1503 
// Indirect jump through memory: FF /4 (rsp supplies the /4 extension).
void Assembler::jmp(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rsp, adr);
}
1510 
// Unconditional jump to a label.  Bound labels use the short (EB) form
// when the displacement fits in 8 bits and maybe_short allows it, else
// the long (E9) form.
void Assembler::jmp(Label& L, bool maybe_short) {
  if (L.is_bound()) {
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
    InstructionMark im(this);
    const int short_size = 2;
    const int long_size = 5;
    intptr_t offs = entry - _code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      emit_byte(0xEB);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      emit_byte(0xE9);
      emit_long(offs - long_size);
    }
  } else {
    // By default, forward jumps are always 32-bit displacements, since
    // we can't yet know where the label will be bound.  If you're sure that
    // the forward jump will not run beyond 256 bytes, use jmpb to
    // force an 8-bit displacement.
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xE9);
    emit_long(0);
  }
}
1537 
// Indirect jump through a register: FF /4 with a register operand.
void Assembler::jmp(Register entry) {
  int encode = prefix_and_encode(entry->encoding());
  emit_byte(0xFF);
  emit_byte(0xE0 | encode);
}

// Direct jump to an absolute address, emitted as E9 with a 32-bit
// PC-relative displacement measured from the end of the instruction.
void Assembler::jmp_literal(address dest, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xE9);
  assert(dest != NULL, "must have a target");
  intptr_t disp = dest - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (jmp)");
  emit_data(disp, rspec.reloc(), call32_operand);
}
1552 
1553 void Assembler::jmpb(Label& L) {
1554   if (L.is_bound()) {
1555     const int short_size = 2;
1556     address entry = target(L);
1557     assert(entry != NULL, "jmp most probably wrong");
1558 #ifdef ASSERT
1559     intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
1560     intptr_t delta = short_branch_delta();
1561     if (delta != 0) {
1562       dist += (dist < 0 ? (-delta) :delta);
1563     }
1564     assert(is8bit(dist), "Dispacement too large for a short jmp");
1565 #endif
1566     intptr_t offs = entry - _code_pos;
1567     emit_byte(0xEB);
1568     emit_byte((offs - short_size) & 0xFF);
1569   } else {
1570     InstructionMark im(this);
1571     L.add_patch_at(code(), locator());
1572     emit_byte(0xEB);
1573     emit_byte(0);
1574   }
1575 }
1576 
// Load the MXCSR control/status register from memory: 0F AE /2.
void Assembler::ldmxcsr( Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(2), src);  // /2 = ldmxcsr
}
1585 
// Load effective address, 32-bit result.  On 64-bit the 0x67 address-size
// override makes the effective-address computation 32 bits wide.
void Assembler::leal(Register dst, Address src) {
  InstructionMark im(this);
#ifdef _LP64
  emit_byte(0x67); // addr32
  prefix(src, dst);
#endif // LP64
  emit_byte(0x8D);
  emit_operand(dst, src);
}
1595 
// Emit a LOCK prefix (F0) for the following instruction, or — when the
// Atomics experiment flag bit 0 is set — a NOP (90) instead, which makes
// the following instruction non-atomic.
void Assembler::lock() {
  if (Atomics & 1) {
     // Emit either nothing, a NOP, or a NOP: prefix
     emit_byte(0x90) ;
  } else {
     emit_byte(0xF0);
  }
}
1604 
// Count leading zero bits: F3 0F BD.  Without LZCNT support this encoding
// silently decodes as BSR, hence the guard.
void Assembler::lzcntl(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
1613 
// Emit mfence instruction (full memory fence): 0F AE F0.
void Assembler::mfence() {
  NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
  emit_byte( 0x0F );
  emit_byte( 0xAE );
  emit_byte( 0xF0 );
}

// Pointer-sized register move: movq on 64-bit, movl on 32-bit.
void Assembler::mov(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
1625 
// Aligned packed moves: movapd (66 prefix) / movaps (no prefix), opcode 0x28.
void Assembler::movapd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x28);
  emit_byte(0xC0 | encode);
}

void Assembler::movaps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x28);
  emit_byte(0xC0 | encode);
}

// Move low 64 bits of src into high 64 bits of dst (MOVLHPS, 0F 16).
void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
  emit_byte(0x16);
  emit_byte(0xC0 | encode);
}
1646 
// Byte moves.  The 'true' passed to prefix() requests byte-instruction
// handling (on 64-bit a REX may be needed to reach sil/dil/spl/bpl).
void Assembler::movb(Register dst, Address src) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  InstructionMark im(this);
  prefix(src, dst, true);
  emit_byte(0x8A);  // MOV r8, r/m8
  emit_operand(dst, src);
}


// Store immediate byte to memory: C6 /0 ib.
void Assembler::movb(Address dst, int imm8) {
  InstructionMark im(this);
   prefix(dst);
  emit_byte(0xC6);
  emit_operand(rax, dst, 1);  // /0 = mov; 1 = size of trailing imm8
  emit_byte(imm8);
}


void Assembler::movb(Address dst, Register src) {
  assert(src->has_byte_register(), "must have byte register");
  InstructionMark im(this);
  prefix(dst, src, true);
  emit_byte(0x88);  // MOV r/m8, r8
  emit_operand(src, dst);
}
1672 
// 32-bit moves between XMM and GP registers/memory (MOVD: 66 0F 6E/7E).
void Assembler::movdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_operand(dst, src);
}

void Assembler::movdl(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_operand(src, dst);
}

// Aligned 128-bit move between XMM registers (MOVDQA: 66 0F 6F).
void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}
1710 
// Unaligned 128-bit moves (MOVDQU: F3 0F 6F load / 7F store).
void Assembler::movdqu(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}

void Assembler::movdqu(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7F);
  emit_operand(src, dst);
}
1733 
// Move Unaligned 256bit Vector
// AVX VMOVDQU with VEX.256 (vector256 = true), i.e. full YMM registers.
void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(UseAVX, "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}

void Assembler::vmovdqu(XMMRegister dst, Address src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::vmovdqu(Address dst, XMMRegister src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  // swap src<->dst for encoding
  assert(src != xnoreg, "sanity");
  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
  emit_byte(0x7F);
  emit_operand(src, dst);
}
1762 
// Uses zero extension on 64bit

// 32-bit moves; on 64-bit targets the upper 32 bits of dst are zeroed.
void Assembler::movl(Register dst, int32_t imm32) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);  // MOV r32, imm32 (opcode carries the register)
  emit_long(imm32);
}

void Assembler::movl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);  // MOV r32, r/m32
  emit_byte(0xC0 | encode);
}

void Assembler::movl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

// Store immediate to memory: C7 /0 id.
void Assembler::movl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);  // /0 = mov; 4 = size of trailing imm32
  emit_long(imm32);
}

void Assembler::movl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x89);  // MOV r/m32, r32
  emit_operand(src, dst);
}
1798 
// New cpus require to use movsd and movss to avoid partial register stall
// when loading from memory. But for old Opteron use movlpd instead of movsd.
// The selection is done in MacroAssembler::movdbl() and movflt().
// MOVLPD: load 64 bits into the low half of dst (66 0F 12).
void Assembler::movlpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x12);
  emit_operand(dst, src);
}
1809 
// 64-bit MMX moves (MOVQ: 0F 6F load / 7F store).
void Assembler::movq( MMXRegister dst, Address src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::movq( Address dst, MMXRegister src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x7F);
  // workaround gcc (3.2.1-7a) bug
  // In that version of gcc with only an emit_operand(MMX, Address)
  // gcc will tail jump and try and reverse the parameters completely
  // obliterating dst in the process. By having a version available
  // that doesn't need to swap the args at the tail jump the bug is
  // avoided.
  emit_operand(dst, src);
}
1829 
// 64-bit XMM moves (MOVQ: F3 0F 7E load / 66 0F D6 store).
void Assembler::movq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7E);
  emit_operand(dst, src);
}

void Assembler::movq(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0xD6);
  emit_operand(src, dst);
}
1845 
// MOVSX r32, r/m8 — opcode 0x0F 0xBE; sign-extending byte load.
void Assembler::movsbl(Register dst, Address src) { // movsxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}
1853 
// MOVSX r32, r8 — register form; the 'true' argument requests a byte-register
// encoding (REX may be needed to reach sil/dil/spl/bpl on 64-bit).
void Assembler::movsbl(Register dst, Register src) { // movsxb
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);  // ModRM with mod=11 (register-direct)
}
1861 
// MOVSD xmm, xmm — F2-prefixed 0x10; copies the low double.
void Assembler::movsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x10);
  emit_byte(0xC0 | encode);
}
1868 
// MOVSD xmm, m64 — F2-prefixed 0x10; scalar double load.
void Assembler::movsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x10);
  emit_operand(dst, src);
}
1876 
// MOVSD m64, xmm — F2-prefixed 0x11; scalar double store.
void Assembler::movsd(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x11);
  emit_operand(src, dst);
}
1884 
// MOVSS xmm, xmm — F3-prefixed 0x10; copies the low single.
void Assembler::movss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x10);
  emit_byte(0xC0 | encode);
}
1891 
// MOVSS xmm, m32 — F3-prefixed 0x10; scalar single load.
void Assembler::movss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x10);
  emit_operand(dst, src);
}
1899 
// MOVSS m32, xmm — F3-prefixed 0x11; scalar single store.
void Assembler::movss(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x11);
  emit_operand(src, dst);
}
1907 
// MOVSX r32, r/m16 — opcode 0x0F 0xBF; sign-extending word load.
void Assembler::movswl(Register dst, Address src) { // movsxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}
1915 
// MOVSX r32, r16 — register form of the above.
void Assembler::movswl(Register dst, Register src) { // movsxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}
1922 
// MOV r/m16, imm16 — 0x66 operand-size prefix + 0xC7 /0 + 16-bit immediate.
// The trailing '2' tells emit_operand a 2-byte immediate follows the operand,
// so RIP-relative displacements are computed correctly.
void Assembler::movw(Address dst, int imm16) {
  InstructionMark im(this);

  emit_byte(0x66); // switch to 16-bit mode
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 2);  // rax == /0 opcode extension, not a real operand
  emit_word(imm16);
}
1932 
// MOV r16, r/m16 — 0x66 prefix + 0x8B /r; 16-bit load.
void Assembler::movw(Register dst, Address src) {
  InstructionMark im(this);
  emit_byte(0x66);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}
1940 
// MOV r/m16, r16 — 0x66 prefix + 0x89 /r; 16-bit store.
void Assembler::movw(Address dst, Register src) {
  InstructionMark im(this);
  emit_byte(0x66);
  prefix(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
1948 
// MOVZX r32, r/m8 — opcode 0x0F 0xB6; zero-extending byte load.
void Assembler::movzbl(Register dst, Address src) { // movzxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}
1956 
// MOVZX r32, r8 — register form; 'true' requests byte-register encoding.
void Assembler::movzbl(Register dst, Register src) { // movzxb
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}
1964 
// MOVZX r32, r/m16 — opcode 0x0F 0xB7; zero-extending word load.
void Assembler::movzwl(Register dst, Address src) { // movzxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}
1972 
// MOVZX r32, r16 — register form of the above.
void Assembler::movzwl(Register dst, Register src) { // movzxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}
1979 
// MUL r/m32 — 0xF7 /4 (unsigned multiply into edx:eax).
void Assembler::mull(Address src) {
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xF7);
  emit_operand(rsp, src);  // rsp encodes the /4 opcode extension
}
1986 
// MUL r32 — register form; ModRM 0xE0|reg == mod=11, /4 extension.
void Assembler::mull(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xE0 | encode);
}
1992 
// MULSD xmm, m64 — F2-prefixed 0x59; scalar double multiply.
void Assembler::mulsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_operand(dst, src);
}
2000 
// MULSD xmm, xmm — register form of the above.
void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}
2007 
// MULSS xmm, m32 — F3-prefixed 0x59; scalar single multiply.
void Assembler::mulss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_operand(dst, src);
}
2015 
// MULSS xmm, xmm — register form of the above.
void Assembler::mulss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}
2022 
// NEG r32 — 0xF7 /3 (two's-complement negate).
void Assembler::negl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD8 | encode);  // 0xD8 = mod=11 with /3 extension
}
2028 
// Emit exactly 'i' bytes of padding (i > 0) using the most efficient nop
// sequence available for the current CPU:
//  - debug (ASSERT) builds: plain 0x90 bytes so disassemblers/debuggers
//    are never confused by multi-byte nops;
//  - Intel with UseAddressNop: multi-byte address nops, never consecutive;
//  - AMD with UseAddressNop: multi-byte address nops, consecutive allowed;
//  - otherwise: 0x66-prefixed 0x90 nops per the AMD optimization guide.
void Assembler::nop(int i) {
#ifdef ASSERT
  assert(i > 0, " ");
  // The fancy nops aren't currently recognized by debuggers making it a
  // pain to disassemble code while debugging. If asserts are on clearly
  // speed is not an issue so simply use the single byte traditional nop
  // to do alignment.

  for (; i > 0 ; i--) emit_byte(0x90);
  return;

#endif // ASSERT

  if (UseAddressNop && VM_Version::is_intel()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for Intel
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is Intel specific - don't use consecutive address nops

    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90

    while(i >= 15) {
      // For Intel don't generate consecutive addess nops (mix with regular nops)
      // Each iteration emits 15 bytes: an 11-byte address nop + a 4-byte nop.
      i -= 15;
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      addr_nop_8();
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x90);   // nop
    }
    // Intentional case fall-through below: each extra case adds one prefix byte.
    switch (i) {
      case 14:
        emit_byte(0x66); // size prefix
      case 13:
        emit_byte(0x66); // size prefix
      case 12:
        addr_nop_8();
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x90); // nop
        break;
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }
  if (UseAddressNop && VM_Version::is_amd()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for AMD.
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is AMD specific - use consecutive address nops

    // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //     Size prefixes (0x66) are added for larger sizes

    while(i >= 22) {
      // Each iteration emits 11 bytes, leaving at least 11 for the code below.
      i -= 11;
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      addr_nop_8();
    }
    // Generate first nop for size between 21-12
    // (fall-through accumulates prefix bytes and decrements i accordingly)
    switch (i) {
      case 21:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 20:
      case 19:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 18:
      case 17:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 16:
      case 15:
        i -= 8;
        addr_nop_8();
        break;
      case 14:
      case 13:
        i -= 7;
        addr_nop_7();
        break;
      case 12:
        i -= 6;
        emit_byte(0x66); // size prefix
        addr_nop_5();
        break;
      default:
        assert(i < 12, " ");
    }

    // Generate second nop for size between 11-1
    switch (i) {
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }

  // Using nops with size prefixes "0x66 0x90".
  // From AMD Optimization Guide:
  //  1: 0x90
  //  2: 0x66 0x90
  //  3: 0x66 0x66 0x90
  //  4: 0x66 0x66 0x66 0x90
  //  5: 0x66 0x66 0x90 0x66 0x90
  //  6: 0x66 0x66 0x90 0x66 0x66 0x90
  //  7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
  //  8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
  //  9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  //
  while(i > 12) {
    i -= 4;
    emit_byte(0x66); // size prefix
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90); // nop
  }
  // 1 - 12 nops
  if(i > 8) {
    if(i > 9) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  // 1 - 8 nops
  if(i > 4) {
    if(i > 6) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  switch (i) {
    case 4:
      emit_byte(0x66);
    case 3:
      emit_byte(0x66);
    case 2:
      emit_byte(0x66);
    case 1:
      emit_byte(0x90);
      break;
    default:
      assert(i == 0, " ");
  }
}
2276 
// NOT r32 — 0xF7 /2 (one's-complement).
void Assembler::notl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD0 | encode );  // 0xD0 = mod=11 with /2 extension
}
2282 
// OR r/m32, imm32 — 0x81 /1; rcx encodes the /1 opcode extension.
void Assembler::orl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rcx, dst, imm32);
}
2288 
// OR r32, imm32 — 0x81 /1 register form (emit_arith may pick the imm8 form).
void Assembler::orl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC8, dst, imm32);
}
2293 
// OR r32, r/m32 — opcode 0x0B /r.
void Assembler::orl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}
2300 
// OR r32, r32 — register form; prefix_and_encode only for its REX side effect.
void Assembler::orl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}
2305 
// PACKUSWB xmm, m128 — 66-prefixed 0x67. Legacy SSE form requires the memory
// operand to be 16-byte aligned, hence the UseAVX guard.
void Assembler::packuswb(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x67);
  emit_operand(dst, src);
}
2314 
// PACKUSWB xmm, xmm — register form of the above (no alignment concern).
void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x67);
  emit_byte(0xC0 | encode);
}
2321 
// PCMPESTRI xmm, m128, imm8 — SSE4.2 string compare; 0F 3A map, opcode 0x61.
void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_operand(dst, src);
  emit_byte(imm8);  // comparison-mode control byte
}
2330 
// PCMPESTRI xmm, xmm, imm8 — register form of the above.
void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_byte(0xC0 | encode);
  emit_byte(imm8);
}
2338 
// PMOVZXBW xmm, m64 — SSE4.1 zero-extend bytes to words; 0F 38 map, 0x30.
void Assembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_operand(dst, src);
}
2346 
// PMOVZXBW xmm, xmm — register form of the above.
void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_byte(0xC0 | encode);
}
2353 
// generic
// POP r — single-byte opcode 0x58+rd.
void Assembler::pop(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0x58 | encode);
}
2359 
// POPCNT r32, r/m32 — F3 0x0F 0xB8. The F3 mandatory prefix must come
// before any REX prefix emitted by prefix().
void Assembler::popcntl(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}
2369 
// POPCNT r32, r32 — register form of the above.
void Assembler::popcntl(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}
2378 
// POPF — pop flags register; single-byte opcode 0x9D.
void Assembler::popf() {
  emit_byte(0x9D);
}
2382 
2383 #ifndef _LP64 // no 32bit push/pop on amd64
// POP r/m32 — 0x8F /0 (32-bit only; excluded on LP64 by the surrounding #ifndef).
void Assembler::popl(Address dst) {
  // NOTE: this will adjust stack by 8byte on 64bits
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst);  // rax encodes the /0 opcode extension
}
2391 #endif
2392 
// Shared helper for the prefetch* emitters: operand prefix + 0x0F escape byte.
void Assembler::prefetch_prefix(Address src) {
  prefix(src);
  emit_byte(0x0F);
}
2397 
// PREFETCHNTA — 0x0F 0x18 /0 (non-temporal hint).
void Assembler::prefetchnta(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rax, src); // 0, src
}
2405 
// PREFETCH (3DNow!) — 0x0F 0x0D /0.
void Assembler::prefetchr(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rax, src); // 0, src
}
2413 
// PREFETCHT0 — 0x0F 0x18 /1 (all cache levels).
void Assembler::prefetcht0(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rcx, src); // 1, src
}
2421 
// PREFETCHT1 — 0x0F 0x18 /2.
void Assembler::prefetcht1(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rdx, src); // 2, src
}
2429 
// PREFETCHT2 — 0x0F 0x18 /3.
void Assembler::prefetcht2(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rbx, src); // 3, src
}
2437 
// PREFETCHW (3DNow!) — 0x0F 0x0D /1 (prefetch with intent to write).
void Assembler::prefetchw(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rcx, src); // 1, src
}
2445 
// Emit an arbitrary prefix byte (e.g. REX_W, segment override).
void Assembler::prefix(Prefix p) {
  a_byte(p);
}
2449 
// POR xmm, xmm — 66-prefixed 0xEB; bitwise OR of packed data.
void Assembler::por(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEB);
  emit_byte(0xC0 | encode);
}
2456 
// POR xmm, m128 — memory form; legacy SSE needs a 16-byte aligned operand,
// hence the UseAVX guard.
void Assembler::por(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEB);
  emit_operand(dst, src);
}
2465 
// PSHUFD xmm, xmm, imm8 — 66-prefixed 0x70; dword shuffle, 'mode' is the
// immediate shuffle-control byte.
void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x70);
  emit_byte(0xC0 | encode);
  emit_byte(mode & 0xFF);

}
2475 
// PSHUFD xmm, m128, imm8 — memory form; aligned operand required in SSE mode.
void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);  // immediate follows the operand bytes
}
2486 
// PSHUFLW xmm, xmm, imm8 — F2-prefixed 0x70; shuffles the low four words.
void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);
  emit_byte(0xC0 | encode);
  emit_byte(mode & 0xFF);
}
2495 
// PSHUFLW xmm, m128, imm8 — memory form; aligned operand required in SSE mode.
void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);
}
2506 
// PSRLQ xmm, imm8 — 66-prefixed 0x73 /2; xmm2 below is not an operand, it
// encodes the /2 opcode-extension field of ModRM.
void Assembler::psrlq(XMMRegister dst, int shift) {
  // Shift 64 bit value logically right by specified number of bits.
  // HMM Table D-1 says sse2 or mmx.
  // Do not confuse it with psrldq SSE2 instruction which
  // shifts 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
  emit_byte(0x73);
  emit_byte(0xC0 | encode);
  emit_byte(shift);
}
2518 
// PSRLDQ xmm, imm8 — 66-prefixed 0x73 /3 (xmm3 encodes /3); byte-wise shift
// of the whole 128-bit register.
void Assembler::psrldq(XMMRegister dst, int shift) {
  // Shift 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66);
  emit_byte(0x73);
  emit_byte(0xC0 | encode);
  emit_byte(shift);
}
2527 
// PTEST xmm, m128 — SSE4.1 logical-compare setting ZF/CF; 0F 38 map, 0x17.
void Assembler::ptest(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_operand(dst, src);
}
2536 
// PTEST xmm, xmm — register form of the above.
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_byte(0xC0 | encode);
}
2543 
// PUNPCKLBW xmm, m128 — 66-prefixed 0x60; interleave low bytes.
void Assembler::punpcklbw(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x60);
  emit_operand(dst, src);
}
2552 
// PUNPCKLBW xmm, xmm — register form of the above.
void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x60);
  emit_byte(0xC0 | encode);
}
2559 
// PUNPCKLDQ xmm, m128 — 66-prefixed 0x62; interleave low dwords.
void Assembler::punpckldq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x62);
  emit_operand(dst, src);
}
2568 
// PUNPCKLDQ xmm, xmm — register form of the above.
void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x62);
  emit_byte(0xC0 | encode);
}
2575 
// PUSH imm32 — opcode 0x68 with a 4-byte immediate.
void Assembler::push(int32_t imm32) {
  // in 64bits we push 64bits onto the stack but only
  // take a 32bit immediate
  emit_byte(0x68);
  emit_long(imm32);
}
2582 
// PUSH r — single-byte opcode 0x50+rd.
void Assembler::push(Register src) {
  int encode = prefix_and_encode(src->encoding());

  emit_byte(0x50 | encode);
}
2588 
// PUSHF — push flags register; single-byte opcode 0x9C.
void Assembler::pushf() {
  emit_byte(0x9C);
}
2592 
2593 #ifndef _LP64 // no 32bit push/pop on amd64
// PUSH r/m32 — 0xFF /6 (32-bit only; excluded on LP64 by the surrounding #ifndef).
void Assembler::pushl(Address src) {
  // Note this will push 64bit on 64bit
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xFF);
  emit_operand(rsi, src);  // rsi encodes the /6 opcode extension
}
2601 #endif
2602 
// PXOR xmm, m128 — 66-prefixed 0xEF; aligned operand required in SSE mode.
void Assembler::pxor(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEF);
  emit_operand(dst, src);
}
2611 
// PXOR xmm, xmm — register form of the above (common idiom for zeroing).
void Assembler::pxor(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0xEF);
  emit_byte(0xC0 | encode);
}
2618 
// RCL r32, imm8 — rotate through carry. Uses the shorter Grp2 0xD1 /2
// encoding when the count is 1, else 0xC1 /2 with an immediate byte.
void Assembler::rcll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}
2631 
// copies data from [esi] to [edi] using rcx pointer sized words
// generic
// REP MOVS — F3 prefix + 0xA5 (REX.W on 64-bit makes it MOVSQ).
void Assembler::rep_mov() {
  emit_byte(0xF3);
  // MOVSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xA5);
}
2640 
// sets rcx pointer sized words with rax, value at [edi]
// generic
// REP STOS — F3 prefix + 0xAB (REX.W on 64-bit makes it STOSQ).
void Assembler::rep_set() { // rep_set
  emit_byte(0xF3);
  // STOSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAB);
}
2649 
// scans rcx pointer sized words at [edi] for occurance of rax,
// generic
// REPNE SCAS — F2 prefix + 0xAF (REX.W on 64-bit makes it SCASQ).
void Assembler::repne_scan() { // repne_scan
  emit_byte(0xF2);
  // SCASQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAF);
}
2658 
2659 #ifdef _LP64
// scans rcx 4 byte words at [edi] for occurance of rax,
// generic
// 32-bit-element variant: same as repne_scan but without REX.W (64-bit only).
void Assembler::repne_scanl() { // repne_scan
  emit_byte(0xF2);
  // SCASL
  emit_byte(0xAF);
}
2667 #endif
2668 
// RET — plain 0xC3, or 0xC2 imm16 to also pop imm16 bytes of arguments.
void Assembler::ret(int imm16) {
  if (imm16 == 0) {
    emit_byte(0xC3);
  } else {
    emit_byte(0xC2);
    emit_word(imm16);
  }
}
2677 
// SAHF — store AH into flags; opcode 0x9E. Guarded out on 64-bit where the
// instruction is not universally available.
void Assembler::sahf() {
#ifdef _LP64
  // Not supported in 64bit mode
  ShouldNotReachHere();
#endif
  emit_byte(0x9E);
}
2685 
2686 void Assembler::sarl(Register dst, int imm8) {
2687   int encode = prefix_and_encode(dst->encoding());
2688   assert(isShiftCount(imm8), "illegal shift count");
2689   if (imm8 == 1) {
2690     emit_byte(0xD1);
2691     emit_byte(0xF8 | encode);
2692   } else {
2693     emit_byte(0xC1);
2694     emit_byte(0xF8 | encode);
2695     emit_byte(imm8);
2696   }
2697 }
2698 
// SAR r32, CL — 0xD3 /7; shift count taken from CL.
void Assembler::sarl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}
2704 
// SBB r/m32, imm32 — 0x81 /3; rbx encodes the /3 opcode extension.
void Assembler::sbbl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbx, dst, imm32);
}
2710 
// SBB r32, imm32 — register form (emit_arith may pick the imm8 form).
void Assembler::sbbl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD8, dst, imm32);
}
2715 
2716 
// SBB r32, r/m32 — opcode 0x1B /r.
void Assembler::sbbl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}
2723 
// SBB r32, r32 — register form; prefix_and_encode only for its REX side effect.
void Assembler::sbbl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}
2728 
// SETcc r8 — 0x0F 0x90+cc; sets dst's byte register to 0/1 per condition.
// 'true' requests byte-register encoding (REX for sil/dil/spl/bpl on 64-bit).
void Assembler::setb(Condition cc, Register dst) {
  assert(0 <= cc && cc < 16, "illegal cc");
  int encode = prefix_and_encode(dst->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0x90 | cc);
  emit_byte(0xC0 | encode);
}
2736 
// SHL r32, imm8 — uses the shorter Grp2 0xD1 /4 encoding when the count
// is 1, else 0xC1 /4 with an immediate byte.
void Assembler::shll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  if (imm8 == 1 ) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}
2749 
// SHL r32, CL — 0xD3 /4; shift count taken from CL.
void Assembler::shll(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}
2755 
2756 void Assembler::shrl(Register dst, int imm8) {
2757   assert(isShiftCount(imm8), "illegal shift count");
2758   int encode = prefix_and_encode(dst->encoding());
2759   emit_byte(0xC1);
2760   emit_byte(0xE8 | encode);
2761   emit_byte(imm8);
2762 }
2763 
// SHR r32, CL — 0xD3 /5; shift count taken from CL.
void Assembler::shrl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}
2769 
// copies a single word from [esi] to [edi]
// MOVS — single-byte opcode 0xA5 (no REP prefix).
void Assembler::smovl() {
  emit_byte(0xA5);
}
2774 
// SQRTSD xmm, xmm — F2-prefixed 0x51; scalar double square root.
void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x51);
  emit_byte(0xC0 | encode);
}
2781 
// SQRTSD xmm, m64 — memory form of the above.
void Assembler::sqrtsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x51);
  emit_operand(dst, src);
}
2789 
// SQRTSS xmm, xmm — F3-prefixed 0x51; scalar single square root.
void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x51);
  emit_byte(0xC0 | encode);
}
2796 
// SQRTSS xmm, m32 — memory form of the above.
void Assembler::sqrtss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x51);
  emit_operand(dst, src);
}
2804 
// STMXCSR m32 — 0x0F 0xAE /3; stores the MXCSR control/status register.
void Assembler::stmxcsr( Address dst) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(3), dst);  // register #3 encodes the /3 extension
}
2813 
// SUB r/m32, imm32 — 0x81 /5; rbp encodes the /5 opcode extension.
void Assembler::subl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);
}
2819 
// SUB r/m32, r32 — opcode 0x29 /r.
void Assembler::subl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}
2826 
// SUB r32, imm32 — register form (emit_arith may pick the imm8 form).
void Assembler::subl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE8, dst, imm32);
}
2831 
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (needed when the instruction must have a fixed length, e.g. for patching).
void Assembler::subl_imm32(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}
2837 
// SUB r32, r/m32 — opcode 0x2B /r.
void Assembler::subl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}
2844 
// SUB r32, r32 — register form; prefix_and_encode only for its REX side effect.
void Assembler::subl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
2849 
// subsd: scalar double-precision subtract (F2 0F 5C /r), register form.
void Assembler::subsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// subsd with a memory source operand.
void Assembler::subsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

// subss: scalar single-precision subtract (F3 0F 5C /r), register form.
void Assembler::subss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// subss with a memory source operand.
void Assembler::subss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_operand(dst, src);
}
2879 
// testb: test a byte register against an 8-bit immediate (F6 /0 ib).
// The 'true' argument asks for a REX prefix when needed so that spl/bpl/
// sil/dil are encodable on 64-bit; on 32-bit only al..bh qualify (asserted).
void Assembler::testb(Register dst, int imm8) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  (void) prefix_and_encode(dst->encoding(), true);
  emit_arith_b(0xF6, 0xC0, dst, imm8);
}
2885 
2886 void Assembler::testl(Register dst, int32_t imm32) {
2887   // not using emit_arith because test
2888   // doesn't support sign-extension of
2889   // 8bit operands
2890   int encode = dst->encoding();
2891   if (encode == 0) {
2892     emit_byte(0xA9);
2893   } else {
2894     encode = prefix_and_encode(encode);
2895     emit_byte(0xF7);
2896     emit_byte(0xC0 | encode);
2897   }
2898   emit_long(imm32);
2899 }
2900 
// test dst, src  (85 /r, register-register)
void Assembler::testl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}

// test dst, [src]  (85 /r, memory form)
void Assembler::testl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x85);
  emit_operand(dst, src);
}
2912 
// ucomisd: unordered compare of scalar doubles, sets EFLAGS (66 0F 2E /r).
void Assembler::ucomisd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x2E);
  emit_operand(dst, src);
}

void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x2E);
  emit_byte(0xC0 | encode);
}

// ucomiss: unordered compare of scalar floats, sets EFLAGS (0F 2E /r, no prefix).
void Assembler::ucomiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2E);
  emit_operand(dst, src);
}

void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_NONE);
  emit_byte(0x2E);
  emit_byte(0xC0 | encode);
}
2942 
2943 
// xaddl: exchange-and-add, [dst] and src swap with [dst] += src (0F C1 /r).
// Note: no LOCK prefix here; callers add it when atomicity is required.
void Assembler::xaddl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}

// xchg dst, [src] (87 /r). XCHG with a memory operand is implicitly locked.
void Assembler::xchgl(Register dst, Address src) { // xchg
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}

// xchg dst, src; always uses the two-byte 87 /r form (never the 90+r short form).
void Assembler::xchgl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xc0 | encode);
}
2964 
// xorl: 32-bit integer exclusive-or. 0xF0 is the register-direct ModRM base
// for the /6 (XOR) extension of opcode group 0x81.
void Assembler::xorl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF0, dst, imm32);
}

// xor dst, [src]  (33 /r)
void Assembler::xorl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}

// xor dst, src  (register-register)
void Assembler::xorl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}

// xorpd: bitwise XOR of packed doubles (66 0F 57 /r).
void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

void Assembler::xorpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_66);
  emit_byte(0x57);
  emit_operand(dst, src);
}


// xorps: bitwise XOR of packed floats (0F 57 /r, no mandatory prefix).
void Assembler::xorps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

void Assembler::xorps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, dst, src, VEX_SIMD_NONE);
  emit_byte(0x57);
  emit_operand(dst, src);
}
3012 
// AVX 3-operands non destructive source instructions (encoded with VEX prefix)
// dst = nds OP src; nds is encoded in the VEX.vvvv field and is not modified.

void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_operand(dst, src);
}

void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_operand(dst, src);
}

void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x58);
  emit_byte(0xC0 | encode);
}

void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
  emit_byte(0x54);
  emit_operand(dst, src);
}

void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
  emit_byte(0x54);
  emit_operand(dst, src);
}

void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_operand(dst, src);
}

void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5E);
  emit_byte(0xC0 | encode);
}

void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_operand(dst, src);
}

void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}
3105 
3106 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
3107   InstructionMark im(this);
3108   vex_prefix(dst, nds, src, VEX_SIMD_F3);
3109   emit_byte(0x59);
3110   emit_operand(dst, src);
3111 }
3112 
// vmulss register form: dst = nds * src (VEX.F3 0F 59 /r).
void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x59);
  emit_byte(0xC0 | encode);
}


// vsubsd/vsubss: three-operand scalar subtract, dst = nds - src (opcode 0x5C).
void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F2);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_operand(dst, src);
}

void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_F3);
  emit_byte(0x5C);
  emit_byte(0xC0 | encode);
}

// vxorpd/vxorps: three-operand packed XOR (opcode 0x57); the register forms
// take a vector256 flag selecting 256-bit (YMM) vs 128-bit (XMM) operation.
void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_66); // 128-bit vector
  emit_byte(0x57);
  emit_operand(dst, src);
}

void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
  vex_prefix(dst, nds, src, VEX_SIMD_NONE); // 128-bit vector
  emit_byte(0x57);
  emit_operand(dst, src);
}

void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256);
  emit_byte(0x57);
  emit_byte(0xC0 | encode);
}

// vinsertf128h: insert src into the upper 128 bits of the 256-bit dst
// (VEX.256 66 0F3A 18 /r ib, with ib fixed at 1 below).
void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
  emit_byte(0x18);
  emit_byte(0xC0 | encode);
  // 0x00 - insert into lower 128 bits
  // 0x01 - insert into upper 128 bits
  emit_byte(0x01);
}

// vzeroupper: zero the upper halves of all YMM registers to avoid
// AVX/SSE transition penalties. The VEX prefix carries no real operands.
void Assembler::vzeroupper() {
  assert(VM_Version::supports_avx(), "");
  (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
  emit_byte(0x77);
}
3197 
3198 
#ifndef _LP64
// 32bit only pieces of the assembler

// cmp src1, imm32 with relocated immediate (81 /7 id).
// 0xF8 = 0xC0 | (7 << 3): register-direct ModRM with the /7 (CMP) extension.
void Assembler::cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT
  InstructionMark im(this);
  emit_byte(0x81);
  emit_byte(0xF8 | src1->encoding());
  emit_data(imm32, rspec, 0);
}

// cmp [src1], imm32 with relocated immediate; rdi (encoding 7) supplies
// the /7 (CMP) opcode extension in the ModRM reg field.
void Assembler::cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs
  InstructionMark im(this);
  emit_byte(0x81);
  emit_operand(rdi, src1);
  emit_data(imm32, rspec, 0);
}

// The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax,
// and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded
// into rdx:rax.  The ZF is set if the compared values were equal, and cleared otherwise.
void Assembler::cmpxchg8(Address adr) {
  InstructionMark im(this);
  emit_byte(0x0F);
  emit_byte(0xc7);
  emit_operand(rcx, adr);  // rcx (encoding 1) = /1 extension selecting CMPXCHG8B
}

// One-byte dec (48+r), 32-bit only; on 64-bit these opcodes are REX prefixes.
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
 emit_byte(0x48 | dst->encoding());
}

#endif // _LP64
3234 
// 64bit typically doesn't use the x87 but needs to for the trig funcs
//
// x87 encoding note: in the memory forms below, the Register passed to
// emit_operand32() is not a real operand -- its encoding (rax=0 .. rdi=7)
// supplies the ModRM reg field, i.e. the /digit opcode extension that picks
// the operation within each escape-opcode (D8..DF) group.

void Assembler::fabs() {
  emit_byte(0xD9);
  emit_byte(0xE1);
}

void Assembler::fadd(int i) {
  emit_farith(0xD8, 0xC0, i);  // ST(0) <- ST(0) + ST(i)
}

void Assembler::fadd_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rax, src);  // DC /0: fadd m64fp
}

void Assembler::fadd_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rax, src);  // D8 /0: fadd m32fp
}

void Assembler::fadda(int i) {
  emit_farith(0xDC, 0xC0, i);  // ST(i) <- ST(i) + ST(0)
}

void Assembler::faddp(int i) {
  emit_farith(0xDE, 0xC0, i);  // ST(i) <- ST(i) + ST(0), then pop
}

void Assembler::fchs() {
  emit_byte(0xD9);
  emit_byte(0xE0);
}

void Assembler::fcom(int i) {
  emit_farith(0xD8, 0xD0, i);  // compare ST(0) with ST(i)
}

void Assembler::fcomp(int i) {
  emit_farith(0xD8, 0xD8, i);  // compare ST(0) with ST(i), then pop
}

void Assembler::fcomp_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbx, src);  // DC /3: fcomp m64fp
}

void Assembler::fcomp_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbx, src);  // D8 /3: fcomp m32fp
}

void Assembler::fcompp() {
  emit_byte(0xDE);
  emit_byte(0xD9);  // compare ST(0) with ST(1) and pop both
}

void Assembler::fcos() {
  emit_byte(0xD9);
  emit_byte(0xFF);
}

void Assembler::fdecstp() {
  emit_byte(0xD9);
  emit_byte(0xF6);  // rotate the x87 stack: decrement TOP
}

void Assembler::fdiv(int i) {
  emit_farith(0xD8, 0xF0, i);  // ST(0) <- ST(0) / ST(i)
}

void Assembler::fdiv_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsi, src);  // DC /6: fdiv m64fp
}

void Assembler::fdiv_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsi, src);  // D8 /6: fdiv m32fp
}

void Assembler::fdiva(int i) {
  emit_farith(0xDC, 0xF8, i);  // ST(i) <- ST(i) / ST(0)
}

// Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994)
//       is erroneous for some of the floating-point instructions below.

void Assembler::fdivp(int i) {
  emit_farith(0xDE, 0xF8, i);                    // ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong)
}
3332 
// (x87 continued; the Register argument of emit_operand32() is the ModRM
// /digit opcode extension, not an operand -- rax=0 .. rdi=7.)
void Assembler::fdivr(int i) {
  emit_farith(0xD8, 0xF8, i);  // ST(0) <- ST(i) / ST(0) (reversed divide)
}

void Assembler::fdivr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rdi, src);  // DC /7: fdivr m64fp
}

void Assembler::fdivr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rdi, src);  // D8 /7: fdivr m32fp
}

void Assembler::fdivra(int i) {
  emit_farith(0xDC, 0xF0, i);  // ST(i) <- ST(0) / ST(i)
}

void Assembler::fdivrp(int i) {
  emit_farith(0xDE, 0xF0, i);                    // ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong)
}

void Assembler::ffree(int i) {
  emit_farith(0xDD, 0xC0, i);  // mark ST(i) as empty
}

void Assembler::fild_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rbp, adr);  // DF /5: fild m64int
}

void Assembler::fild_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rax, adr);  // DB /0: fild m32int
}

void Assembler::fincstp() {
  emit_byte(0xD9);
  emit_byte(0xF7);  // rotate the x87 stack: increment TOP
}

void Assembler::finit() {
  emit_byte(0x9B);  // leading fwait: this is the checked FINIT form, not FNINIT
  emit_byte(0xDB);
  emit_byte(0xE3);
}

void Assembler::fist_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdx, adr);  // DB /2: fist m32int
}

void Assembler::fistp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rdi, adr);  // DF /7: fistp m64int
}

void Assembler::fistp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbx, adr);  // DB /3: fistp m32int
}

void Assembler::fld1() {
  emit_byte(0xD9);
  emit_byte(0xE8);  // push +1.0
}

void Assembler::fld_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rax, adr);  // DD /0: fld m64fp
}

void Assembler::fld_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rax, adr);  // D9 /0: fld m32fp
}


void Assembler::fld_s(int index) {
  emit_farith(0xD9, 0xC0, index);  // push a copy of ST(index)
}

void Assembler::fld_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbp, adr);  // DB /5: fld m80fp (extended precision)
}

void Assembler::fldcw(Address src) {
  InstructionMark im(this);
  emit_byte(0xd9);
  emit_operand32(rbp, src);  // D9 /5: load x87 control word
}

void Assembler::fldenv(Address src) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rsp, src);  // D9 /4: load x87 environment
}

void Assembler::fldlg2() {
  emit_byte(0xD9);
  emit_byte(0xEC);  // push log10(2)
}

void Assembler::fldln2() {
  emit_byte(0xD9);
  emit_byte(0xED);  // push ln(2)
}

void Assembler::fldz() {
  emit_byte(0xD9);
  emit_byte(0xEE);  // push +0.0
}

// ln(x) = ln(2) * log2(x), via fyl2x which computes ST(1) * log2(ST(0)).
void Assembler::flog() {
  fldln2();
  fxch();
  fyl2x();
}

// log10(x) = log10(2) * log2(x), same shape as flog().
void Assembler::flog10() {
  fldlg2();
  fxch();
  fyl2x();
}
3468 
// (x87 continued; the Register argument of emit_operand32() is the ModRM
// /digit opcode extension, not an operand -- rax=0 .. rdi=7.)
void Assembler::fmul(int i) {
  emit_farith(0xD8, 0xC8, i);  // ST(0) <- ST(0) * ST(i)
}

void Assembler::fmul_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rcx, src);  // DC /1: fmul m64fp
}

void Assembler::fmul_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rcx, src);  // D8 /1: fmul m32fp
}

void Assembler::fmula(int i) {
  emit_farith(0xDC, 0xC8, i);  // ST(i) <- ST(i) * ST(0)
}

void Assembler::fmulp(int i) {
  emit_farith(0xDE, 0xC8, i);  // ST(i) <- ST(i) * ST(0), then pop
}

void Assembler::fnsave(Address dst) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsi, dst);  // DD /6: fnsave (no wait prefix)
}

void Assembler::fnstcw(Address src) {
  InstructionMark im(this);
  emit_byte(0x9B);  // NOTE: leading fwait makes this the checked FSTCW form, despite the name
  emit_byte(0xD9);
  emit_operand32(rdi, src);  // D9 /7: store x87 control word
}

void Assembler::fnstsw_ax() {
  emit_byte(0xdF);
  emit_byte(0xE0);  // fnstsw ax: copy x87 status word into AX
}

void Assembler::fprem() {
  emit_byte(0xD9);
  emit_byte(0xF8);  // partial remainder (truncating)
}

void Assembler::fprem1() {
  emit_byte(0xD9);
  emit_byte(0xF5);  // partial remainder (IEEE round-to-nearest)
}

void Assembler::frstor(Address src) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsp, src);  // DD /4: restore x87 state saved by fnsave
}

void Assembler::fsin() {
  emit_byte(0xD9);
  emit_byte(0xFE);
}

void Assembler::fsqrt() {
  emit_byte(0xD9);
  emit_byte(0xFA);
}

void Assembler::fst_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rdx, adr);  // DD /2: fst m64fp
}

void Assembler::fst_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rdx, adr);  // D9 /2: fst m32fp
}

void Assembler::fstp_d(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rbx, adr);  // DD /3: fstp m64fp
}

void Assembler::fstp_d(int index) {
  emit_farith(0xDD, 0xD8, index);  // store ST(0) into ST(index), then pop
}

void Assembler::fstp_s(Address adr) {
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rbx, adr);  // D9 /3: fstp m32fp
}

void Assembler::fstp_x(Address adr) {
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdi, adr);  // DB /7: fstp m80fp (extended precision)
}

void Assembler::fsub(int i) {
  emit_farith(0xD8, 0xE0, i);  // ST(0) <- ST(0) - ST(i)
}

void Assembler::fsub_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsp, src);  // DC /4: fsub m64fp
}

void Assembler::fsub_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsp, src);  // D8 /4: fsub m32fp
}

void Assembler::fsuba(int i) {
  emit_farith(0xDC, 0xE8, i);  // ST(i) <- ST(i) - ST(0)
}

void Assembler::fsubp(int i) {
  emit_farith(0xDE, 0xE8, i);                    // ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong)
}
3594 
// (x87 continued.)
void Assembler::fsubr(int i) {
  emit_farith(0xD8, 0xE8, i);  // ST(0) <- ST(i) - ST(0) (reversed subtract)
}

void Assembler::fsubr_d(Address src) {
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbp, src);  // DC /5: fsubr m64fp
}

void Assembler::fsubr_s(Address src) {
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbp, src);  // D8 /5: fsubr m32fp
}

void Assembler::fsubra(int i) {
  emit_farith(0xDC, 0xE0, i);  // ST(i) <- ST(0) - ST(i)
}

void Assembler::fsubrp(int i) {
  emit_farith(0xDE, 0xE0, i);                    // ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong)
}

// ftan: fptan (D9 F2) pushes tan then 1.0; the fstp st(0) (DD D8) pops the
// 1.0 so only the tangent remains on top of the stack.
void Assembler::ftan() {
  emit_byte(0xD9);
  emit_byte(0xF2);
  emit_byte(0xDD);
  emit_byte(0xD8);
}

void Assembler::ftst() {
  emit_byte(0xD9);
  emit_byte(0xE4);  // compare ST(0) against 0.0
}

void Assembler::fucomi(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDB, 0xE8, i);  // unordered compare ST(0) with ST(i), result in EFLAGS
}

void Assembler::fucomip(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDF, 0xE8, i);  // as fucomi, then pop
}

void Assembler::fwait() {
  emit_byte(0x9B);
}

void Assembler::fxch(int i) {
  emit_farith(0xD9, 0xC8, i);  // swap ST(0) and ST(i)
}

void Assembler::fyl2x() {
  emit_byte(0xD9);
  emit_byte(0xF1);  // ST(1) <- ST(1) * log2(ST(0)), then pop
}

void Assembler::frndint() {
  emit_byte(0xD9);
  emit_byte(0xFC);  // round ST(0) to integer per current rounding mode
}

void Assembler::f2xm1() {
  emit_byte(0xD9);
  emit_byte(0xF0);  // ST(0) <- 2^ST(0) - 1
}

void Assembler::fldl2e() {
  emit_byte(0xD9);
  emit_byte(0xEA);  // push log2(e)
}
3670 
// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
static int simd_opc[4] = { 0,    0, 0x38, 0x3A };

// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
// Emits: [mandatory 66/F3/F2 prefix] [REX/REX.W] [0F [38|3A]] for a
// memory-operand instruction described in VEX terms.
void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);
  }
  if (rex_w) {
    prefixq(adr, xreg);
  } else {
    prefix(adr, xreg);
  }
  if (opc > 0) {
    emit_byte(0x0F);
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);
    }
  }
}

// Register-register variant of the above; returns the ModRM payload
// (low 3 bits of each encoding, high bits folded into the REX prefix).
int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);
  }
  int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
                          prefix_and_encode(dst_enc, src_enc);
  if (opc > 0) {
    emit_byte(0x0F);
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);
    }
  }
  return encode;
}
3710 
3711 
// Emit a VEX prefix. The 2-byte form (C5) can only express VEX.R, so the
// 3-byte form (C4) is required whenever B, X or W is set or the opcode map
// is 0F 38 / 0F 3A. Note that R/X/B and vvvv are stored inverted (~) per
// the VEX encoding rules.
void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
  if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
    prefix(VEX_3bytes);

    int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
    byte1 = (~byte1) & 0xE0;   // inverted R/X/B in the top three bits
    byte1 |= opc;              // opcode map select in the low bits
    a_byte(byte1);

    int byte2 = ((~nds_enc) & 0xf) << 3;  // inverted vvvv (second source reg)
    byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;  // W, L, pp
    emit_byte(byte2);
  } else {
    prefix(VEX_2bytes);

    int byte1 = vex_r ? VEX_R : 0;
    byte1 = (~byte1) & 0x80;               // inverted R in bit 7
    byte1 |= ((~nds_enc) & 0xf) << 3;      // inverted vvvv
    byte1 |= (vector256 ? 4 : 0) | pre;    // L, pp
    emit_byte(byte1);
  }
}

// Memory-operand form: derive R/X/B from the destination register number
// and the address' base/index registers.
void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
  bool vex_r = (xreg_enc >= 8);
  bool vex_b = adr.base_needs_rex();
  bool vex_x = adr.index_needs_rex();
  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
}

// Register-register form; returns the ModRM payload (reg/rm fields) since
// the high bits of both encodings are carried by VEX.R / VEX.B.
int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
  bool vex_r = (dst_enc >= 8);
  bool vex_b = (src_enc >= 8);
  bool vex_x = false;
  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
  return (((dst_enc & 7) << 3) | (src_enc & 7));
}
3749 
3750 
// Emit the prefix for an SSE-style instruction: a VEX prefix when AVX is in
// use, otherwise the legacy mandatory-prefix + REX + 0F escape sequence.
// Legacy SSE is two-operand, so nds must coincide with xreg (or be unused).
void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  if (UseAVX > 0) {
    int xreg_enc = xreg->encoding();
    int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
    rex_prefix(adr, xreg, pre, opc, rex_w);
  }
}

// Register-register variant; returns the ModRM payload for the caller to
// fold into its 0xC0 | encode byte.
int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  int dst_enc = dst->encoding();
  int src_enc = src->encoding();
  if (UseAVX > 0) {
    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
    return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
  }
}
3773 
#ifndef _LP64

// One-byte inc (40+r), 32-bit only; on 64-bit these opcodes are REX prefixes.
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  emit_byte(0x40 | dst->encoding());
}

// On 32-bit, lea is just the 32-bit form.
void Assembler::lea(Register dst, Address src) {
  leal(dst, src);
}

// mov [dst], imm32 with a relocated immediate (C7 /0; rax supplies the /0).
void Assembler::mov_literal32(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xC7);
  emit_operand(rax, dst);
  emit_data((int)imm32, rspec, 0);
}

// mov dst, imm32 with a relocated immediate (B8+r id).
void Assembler::mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, 0);
}

void Assembler::popa() { // 32bit
  emit_byte(0x61);
}

// push imm32 with a relocated immediate (68 id).
void Assembler::push_literal32(int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0x68);
  emit_data(imm32, rspec, 0);
}

void Assembler::pusha() { // 32bit
  emit_byte(0x60);
}

// setne dst (0F 95 /0 on the byte register).
void Assembler::set_byte_if_not_zero(Register dst) {
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | dst->encoding());
}

// shld dst, src, cl (0F A5 /r): double-precision shift left.
void Assembler::shldl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xA5);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}

// shrd dst, src, cl (0F AD /r): double-precision shift right.
void Assembler::shrdl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xAD);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}
3830 
#else // LP64

// setne dst (0F 95). Unlike the 32-bit version, a REX prefix may be needed
// so that any register's low byte is addressable (hence the 'true' flag).
void Assembler::set_byte_if_not_zero(Register dst) {
  int enc = prefix_and_encode(dst->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | enc);
}
3839 
// 64bit only pieces of the assembler
// This should only be used by 64bit instructions that can use rip-relative
// it cannot be used by instructions that want an immediate value.

// Decide whether 'adr' can be addressed rip-relatively (i.e. reaches with a
// signed 32-bit displacement) from anywhere the current code might end up in
// the code cache. Conservative: answers false unless reachability can be
// shown for both ends of the code cache plus an instruction-size fudge.
bool Assembler::reachable(AddressLiteral adr) {
  int64_t disp;
  // None will force a 64bit literal to the code stream. Likely a placeholder
  // for something that will be patched later and we need to certain it will
  // always be reachable.
  if (adr.reloc() == relocInfo::none) {
    return false;
  }
  if (adr.reloc() == relocInfo::internal_word_type) {
    // This should be rip relative and easily reachable.
    return true;
  }
  if (adr.reloc() == relocInfo::virtual_call_type ||
      adr.reloc() == relocInfo::opt_virtual_call_type ||
      adr.reloc() == relocInfo::static_call_type ||
      adr.reloc() == relocInfo::static_stub_type ) {
    // This should be rip relative within the code cache and easily
    // reachable until we get huge code caches. (At which point
    // ic code is going to have issues).
    return true;
  }
  if (adr.reloc() != relocInfo::external_word_type &&
      adr.reloc() != relocInfo::poll_return_type &&  // these are really external_word but need special
      adr.reloc() != relocInfo::poll_type &&         // relocs to identify them
      adr.reloc() != relocInfo::runtime_call_type ) {
    return false;
  }

  // Stress the correction code
  if (ForceUnreachable) {
    // Must be runtimecall reloc, see if it is in the codecache
    // Flipping stuff in the codecache to be unreachable causes issues
    // with things like inline caches where the additional instructions
    // are not handled.
    if (CodeCache::find_blob(adr._target) == NULL) {
      return false;
    }
  }
  // For external_word_type/runtime_call_type if it is reachable from where we
  // are now (possibly a temp buffer) and where we might end up
  // anywhere in the codeCache then we are always reachable.
  // This would have to change if we ever save/restore shared code
  // to be more pessimistic.
  disp = (int64_t)adr._target - ((int64_t)CodeCache::low_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;
  disp = (int64_t)adr._target - ((int64_t)CodeCache::high_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;

  disp = (int64_t)adr._target - ((int64_t)_code_pos + sizeof(int));

  // Because rip relative is a disp + address_of_next_instruction and we
  // don't know the value of address_of_next_instruction we apply a fudge factor
  // to make sure we will be ok no matter the size of the instruction we get placed into.
  // We don't have to fudge the checks above here because they are already worst case.

  // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal
  // + 4 because better safe than sorry.
  const int fudge = 12 + 4;
  if (disp < 0) {
    disp -= fudge;
  } else {
    disp += fudge;
  }
  return is_simm32(disp);
}
3909 
3910 // Check if the polling page is not reachable from the code cache using rip-relative
3911 // addressing.
3912 bool Assembler::is_polling_page_far() {
3913   intptr_t addr = (intptr_t)os::get_polling_page();
3914   return ForceUnreachable ||
3915          !is_simm32(addr - (intptr_t)CodeCache::low_bound()) ||
3916          !is_simm32(addr - (intptr_t)CodeCache::high_bound());
3917 }
3918 
3919 void Assembler::emit_data64(jlong data,
3920                             relocInfo::relocType rtype,
3921                             int format) {
3922   if (rtype == relocInfo::none) {
3923     emit_long64(data);
3924   } else {
3925     emit_data64(data, Relocation::spec_simple(rtype), format);
3926   }
3927 }
3928 
// Emit a 64-bit data word with an explicit relocation spec.  The relocation
// is attached at the enclosing instruction's mark rather than at the embedded
// word, so the caller must be inside an InstructionMark scope.
void Assembler::emit_data64(jlong data,
                            RelocationHolder const& rspec,
                            int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(imm_operand == format, "must be immediate");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  // Do not use AbstractAssembler::relocate, which is not intended for
  // embedded words.  Instead, relocate to the enclosing instruction.
  code_section()->relocate(inst_mark(), rspec, format);
#ifdef ASSERT
  check_relocation(rspec, format);
#endif
  emit_long64(data);
}
3943 
// Emit any REX prefix needed for a single 32-bit register operand and return
// its low three encoding bits.  r8-r15 require REX.B; for byte instructions,
// registers with encodings 4-7 (spl/bpl/sil/dil) need a bare REX prefix to be
// addressable at all in 64-bit mode.
int Assembler::prefix_and_encode(int reg_enc, bool byteinst) {
  if (reg_enc >= 8) {
    prefix(REX_B);
    reg_enc -= 8;
  } else if (byteinst && reg_enc >= 4) {
    prefix(REX);
  }
  return reg_enc;
}
3953 
3954 int Assembler::prefixq_and_encode(int reg_enc) {
3955   if (reg_enc < 8) {
3956     prefix(REX_W);
3957   } else {
3958     prefix(REX_WB);
3959     reg_enc -= 8;
3960   }
3961   return reg_enc;
3962 }
3963 
// Emit the REX prefix for a two-register 32-bit instruction and return the
// packed ModRM reg/rm bits (dst in the reg field, src in the r/m field).
// REX.R covers dst >= 8, REX.B covers src >= 8; byte instructions need a
// bare REX for src encodings 4-7 (spl/bpl/sil/dil).
int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst) {
  if (dst_enc < 8) {
    if (src_enc >= 8) {
      prefix(REX_B);
      src_enc -= 8;
    } else if (byteinst && src_enc >= 4) {
      prefix(REX);
    }
  } else {
    if (src_enc < 8) {
      prefix(REX_R);
    } else {
      prefix(REX_RB);
      src_enc -= 8;
    }
    dst_enc -= 8;
  }
  return dst_enc << 3 | src_enc;
}
3983 
// Emit the REX prefix for a two-register 64-bit instruction and return the
// packed ModRM reg/rm bits.  Always sets REX.W; adds REX.R for dst >= 8 and
// REX.B for src >= 8.
int Assembler::prefixq_and_encode(int dst_enc, int src_enc) {
  if (dst_enc < 8) {
    if (src_enc < 8) {
      prefix(REX_W);
    } else {
      prefix(REX_WB);
      src_enc -= 8;
    }
  } else {
    if (src_enc < 8) {
      prefix(REX_WR);
    } else {
      prefix(REX_WRB);
      src_enc -= 8;
    }
    dst_enc -= 8;
  }
  return dst_enc << 3 | src_enc;
}
4003 
4004 void Assembler::prefix(Register reg) {
4005   if (reg->encoding() >= 8) {
4006     prefix(REX_B);
4007   }
4008 }
4009 
// Emit the REX prefix required by a memory operand alone: REX.B for an
// extended base register, REX.X for an extended index register.
void Assembler::prefix(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_XB);
    } else {
      prefix(REX_B);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_X);
    }
  }
}
4023 
// Emit the REX prefix for a 64-bit operation on a memory operand: always
// REX.W, plus REX.B/REX.X for extended base/index registers.
void Assembler::prefixq(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_WXB);
    } else {
      prefix(REX_WB);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_WX);
    } else {
      prefix(REX_W);
    }
  }
}
4039 
4040 
// Emit the REX prefix for a register + memory-operand instruction.
// REX.R covers an extended reg-field register, REX.X/REX.B cover extended
// index/base registers of the address; byte instructions need a bare REX
// for reg encodings 4-7 (spl/bpl/sil/dil).
void Assembler::prefix(Address adr, Register reg, bool byteinst) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      } else if (byteinst && reg->encoding() >= 4 ) {
        prefix(REX);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}
4072 
// Emit the REX prefix for a 64-bit register + memory-operand instruction:
// always REX.W, plus REX.R/REX.X/REX.B as needed for extended registers.
void Assembler::prefixq(Address adr, Register src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}
4104 
// Emit the REX prefix for an XMM register + memory-operand instruction.
// Same selection as the Register variant (XMM8-15 need REX.R) but XMM
// registers never need the bare-REX byte-register special case.
void Assembler::prefix(Address adr, XMMRegister reg) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}
4134 
// Emit the REX prefix for a 64-bit XMM register + memory-operand
// instruction: always REX.W, plus REX.R/REX.X/REX.B as needed.
void Assembler::prefixq(Address adr, XMMRegister src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}
4166 
// 64-bit add-with-carry of an immediate: REX.W + 0x81 /2 (emit_arith may
// pick the sign-extended 8-bit form for small immediates).
void Assembler::adcq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD0, dst, imm32);
}
4171 
// 64-bit add-with-carry from memory: REX.W + 0x13 /r (ADC r64, r/m64).
void Assembler::adcq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}
4178 
4179 void Assembler::adcq(Register dst, Register src) {
4180   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4181   emit_arith(0x13, 0xC0, dst, src);
4182 }
4183 
// 64-bit add of an immediate to memory: REX.W + 0x81/0x83 /0.
void Assembler::addq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rax, dst,imm32);
}
4189 
// 64-bit add of a register into memory: REX.W + 0x01 /r (ADD r/m64, r64).
void Assembler::addq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}
4196 
// 64-bit add of an immediate to a register: REX.W + 0x81 /0.
void Assembler::addq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC0, dst, imm32);
}
4201 
// 64-bit add from memory: REX.W + 0x03 /r (ADD r64, r/m64).
void Assembler::addq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}
4208 
// 64-bit register add: REX.W + 0x03 /r.
void Assembler::addq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
4213 
// 64-bit AND of an immediate into memory: REX.W + 0x81 /4 with a full
// 4-byte immediate.  Note: unlike addq/subq/sbbq(Address, imm) this does
// not use emit_arith_operand, so the short 0x83 form is never emitted and
// the instruction length is fixed.
void Assembler::andq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);  // rsp encodes the /4 extension digit
  emit_long(imm32);
}
4221 
// 64-bit AND of an immediate into a register: REX.W + 0x81 /4.
void Assembler::andq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE0, dst, imm32);
}
4226 
// 64-bit AND from memory: REX.W + 0x23 /r (AND r64, r/m64).
void Assembler::andq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}
4233 
4234 void Assembler::andq(Register dst, Register src) {
4235   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4236   emit_arith(0x23, 0xC0, dst, src);
4237 }
4238 
// Bit-scan-forward, 64-bit: REX.W + 0F BC /r.
void Assembler::bsfq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}
4245 
// Bit-scan-reverse, 64-bit: REX.W + 0F BD /r.  Must not be used on CPUs
// with LZCNT, where an F3-prefixed BSR decodes as LZCNT with different
// semantics.
void Assembler::bsrq(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
4253 
// Byte-swap a 64-bit register: REX.W + 0F C8+rd.
void Assembler::bswapq(Register reg) {
  int encode = prefixq_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}
4259 
// CQO: sign-extend rax into rdx:rax (REX.W + 0x99).
void Assembler::cdqq() {
  prefix(REX_W);
  emit_byte(0x99);
}
4264 
// Flush the cache line containing the address: 0F AE /7 (rdi encodes /7).
void Assembler::clflush(Address adr) {
  prefix(adr);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(rdi, adr);
}
4271 
// 64-bit conditional move, register form: REX.W + 0F 40+cc /r.
void Assembler::cmovq(Condition cc, Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}
4278 
// 64-bit conditional move from memory: REX.W + 0F 40+cc /r.
void Assembler::cmovq(Condition cc, Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
4286 
// 64-bit compare of memory with an immediate: REX.W + 0x81 /7 imm32
// (rdi encodes the /7 extension digit; immediate is always 4 bytes).
void Assembler::cmpq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}
4294 
// 64-bit compare of a register with an immediate: REX.W + 0x81 /7.
void Assembler::cmpq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xF8, dst, imm32);
}
4299 
4300 void Assembler::cmpq(Address dst, Register src) {
4301   InstructionMark im(this);
4302   prefixq(dst, src);
4303   emit_byte(0x3B);
4304   emit_operand(src, dst);
4305 }
4306 
// 64-bit register compare: REX.W + 0x3B /r (computes dst - src).
void Assembler::cmpq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}
4311 
// 64-bit compare of a register with memory: REX.W + 0x3B /r
// (CMP r64, r/m64; computes dst - [src]).
void Assembler::cmpq(Register dst, Address  src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}
4318 
// 64-bit compare-and-exchange: REX.W + 0F B1 /r (rax is the implicit
// comparand).  Callers add LOCK separately if atomicity is required.
void Assembler::cmpxchgq(Register reg, Address adr) {
  InstructionMark im(this);
  prefixq(adr, reg);
  emit_byte(0x0F);
  emit_byte(0xB1);
  emit_operand(reg, adr);
}
4326 
// Convert a signed 64-bit integer register to scalar double:
// F2 REX.W 0F 2A /r.
void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}
4333 
// Convert a signed 64-bit integer in memory to scalar double:
// F2 REX.W 0F 2A /r.
void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_operand(dst, src);
}
4341 
// Convert a signed 64-bit integer register to scalar float:
// F3 REX.W 0F 2A /r.
void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}
4348 
// Convert a signed 64-bit integer in memory to scalar float:
// F3 REX.W 0F 2A /r.
void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_operand(dst, src);
}
4356 
// Truncating convert scalar double to signed 64-bit integer:
// F2 REX.W 0F 2C /r.
void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
4363 
// Truncating convert scalar float to signed 64-bit integer:
// F3 REX.W 0F 2C /r.
void Assembler::cvttss2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
4370 
// 32-bit register decrement: 0xFF /1.
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}
4378 
// 64-bit register decrement: REX.W + 0xFF /1.
void Assembler::decq(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}
4386 
// 64-bit memory decrement: REX.W + 0xFF /1 (rcx encodes the /1 digit).
void Assembler::decq(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}
4394 
// Restore x87/SSE state: 0F AE /1.  prefixq supplies REX.W, selecting the
// 64-bit (FXRSTOR64) image layout.
void Assembler::fxrstor(Address src) {
  prefixq(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(1), src);  // /1 extension digit
}
4401 
// Save x87/SSE state: 0F AE /0.  prefixq supplies REX.W, selecting the
// 64-bit (FXSAVE64) image layout.
void Assembler::fxsave(Address dst) {
  prefixq(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(0), dst);  // /0 extension digit
}
4408 
// Signed 64-bit divide of rdx:rax by a register: REX.W + 0xF7 /7.
void Assembler::idivq(Register src) {
  int encode = prefixq_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}
4414 
// 64-bit two-operand signed multiply: REX.W + 0F AF /r.
void Assembler::imulq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}
4421 
// 64-bit three-operand signed multiply by an immediate: dst = src * value.
// Uses the short REX.W + 0x6B /r ib form when the value fits in a signed
// byte, else REX.W + 0x69 /r id.
void Assembler::imulq(Register dst, Register src, int value) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}
4434 
// 32-bit register increment: 0xFF /0.
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}
4442 
// 64-bit register increment: REX.W + 0xFF /0.
void Assembler::incq(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}
4450 
// 64-bit memory increment: REX.W + 0xFF /0 (rax encodes the /0 digit).
void Assembler::incq(Address dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}
4458 
// Pointer-sized lea: on 64-bit this is simply the q form.
void Assembler::lea(Register dst, Address src) {
  leaq(dst, src);
}
4462 
// Load effective address, 64-bit: REX.W + 0x8D /r.
void Assembler::leaq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8D);
  emit_operand(dst, src);
}
4469 
// Load a full 64-bit immediate into a register: REX.W + B8+rd io.
void Assembler::mov64(Register dst, int64_t imm64) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_long64(imm64);
}
4476 
// Load a 64-bit literal with relocation info (e.g. an oop or address that
// the GC/patching machinery must be able to find): REX.W + B8+rd io.
void Assembler::mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data64(imm64, rspec);
}
4483 
// Load a 32-bit compressed oop immediate into a register (B8+rd id),
// recorded with narrow_oop_operand format so it can be patched.
void Assembler::mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4490 
// Store a 32-bit compressed oop immediate to memory: 0xC7 /0 id, with
// narrow_oop_operand relocation format.
void Assembler::mov_narrow_oop(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);  // /0 digit; 4 = size of trailing immediate
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4498 
// Compare a register against a 32-bit compressed oop immediate:
// 0x81 /7 id, with narrow_oop_operand relocation format.
void Assembler::cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(src1->encoding());
  emit_byte(0x81);
  emit_byte(0xF8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4506 
// Compare memory against a 32-bit compressed oop immediate: 0x81 /7 id,
// with narrow_oop_operand relocation format.
void Assembler::cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(src1);
  emit_byte(0x81);
  emit_operand(rax, src1, 4);  // NOTE(review): rax encodes /0 here — CMP's digit is /7 (rdi); verify against callers
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4514 
// Count leading zeros, 64-bit: F3 REX.W 0F BD /r.  The F3 prefix must be
// emitted before the REX prefix, hence the order below.
void Assembler::lzcntq(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
4523 
// Move a 64-bit GPR into an XMM register: 66 REX.W 0F 6E /r.
void Assembler::movdq(XMMRegister dst, Register src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}
4531 
// Move an XMM register into a 64-bit GPR: 66 REX.W 0F 7E /r.
void Assembler::movdq(Register dst, XMMRegister src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix (the XMM register sits in the
  // ModRM reg field for the 7E store form)
  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}
4540 
// 64-bit register move: REX.W + 0x8B /r.
void Assembler::movq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}
4546 
// 64-bit load: REX.W + 0x8B /r (MOV r64, r/m64).
void Assembler::movq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}
4553 
// 64-bit store: REX.W + 0x89 /r (MOV r/m64, r64).
void Assembler::movq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
4560 
// Sign-extending byte load: REX.W + 0F BE /r (MOVSX r64, r/m8).
void Assembler::movsbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}
4568 
// Sign-extend a byte register to 64 bits: REX.W + 0F BE /r.
void Assembler::movsbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);
}
4575 
// Intended to load a sign-extended 32-bit immediate into a 64-bit register,
// but the encoding below is broken (the register bits are OR'ed into the
// opcode byte instead of a ModRM byte) — it is deliberately disabled.
void Assembler::movslq(Register dst, int32_t imm32) {
  // dbx shows movslq(rcx, 3) as movq     $0x0000000049000000,(%rbx)
  // and movslq(r8, 3); as movl     $0x0000000048000000,(%rbx)
  // as a result we shouldn't use until tested at runtime...
  ShouldNotReachHere();
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xC7 | encode);
  emit_long(imm32);
}
4586 
// Store a sign-extended 32-bit immediate as a 64-bit memory value:
// REX.W + 0xC7 /0 id.
void Assembler::movslq(Address dst, int32_t imm32) {
  assert(is_simm32(imm32), "lost bits");
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);  // /0 digit; 4 = size of trailing immediate
  emit_long(imm32);
}
4595 
// Sign-extending dword load: REX.W + 0x63 /r (MOVSXD r64, r/m32).
void Assembler::movslq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x63);
  emit_operand(dst, src);
}
4602 
// Sign-extend a 32-bit register to 64 bits: REX.W + 0x63 /r (MOVSXD).
void Assembler::movslq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x63);
  emit_byte(0xC0 | encode);
}
4608 
// Sign-extending word load: REX.W + 0F BF /r (MOVSX r64, r/m16).
void Assembler::movswq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}
4616 
// Sign-extend a 16-bit register to 64 bits: REX.W + 0F BF /r.
void Assembler::movswq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}
4623 
// Zero-extending byte load: REX.W + 0F B6 /r (MOVZX r64, r/m8).
void Assembler::movzbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}
4631 
// Zero-extend a byte register to 64 bits: REX.W + 0F B6 /r.
void Assembler::movzbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}
4638 
// Zero-extending word load: REX.W + 0F B7 /r (MOVZX r64, r/m16).
void Assembler::movzwq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}
4646 
// Zero-extend a 16-bit register to 64 bits: REX.W + 0F B7 /r.
void Assembler::movzwq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}
4653 
// Two's-complement negate, 64-bit: REX.W + 0xF7 /3.
void Assembler::negq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD8 | encode);
}
4659 
// One's-complement negate, 64-bit: REX.W + 0xF7 /2.
void Assembler::notq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD0 | encode);
}
4665 
// 64-bit OR of an immediate into memory: REX.W + 0x81 /1 id
// (rcx encodes the /1 digit; the immediate is always 4 bytes).
void Assembler::orq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rcx, dst, 4);
  emit_long(imm32);
}
4673 
// 64-bit OR of an immediate into a register: REX.W + 0x81 /1.
void Assembler::orq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC8, dst, imm32);
}
4678 
// 64-bit OR from memory: REX.W + 0x0B /r (OR r64, r/m64).
void Assembler::orq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}
4685 
// 64-bit register OR: REX.W + 0x0B /r.
void Assembler::orq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}
4690 
// Restore all 15 general registers (except rsp) from the 16-slot frame
// laid down by pusha(), then pop the frame.  Slot 11 (rsp's slot) is
// intentionally skipped; rsp is restored by the final addq.
void Assembler::popa() { // 64bit
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9,  Address(rsp, 6 * wordSize));
  movq(r8,  Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  // skip rsp
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));

  addq(rsp, 16 * wordSize);
}
4711 
// Population count from memory, 64-bit: F3 REX.W 0F B8 /r.  The mandatory
// F3 prefix must precede the REX prefix, hence its emission before prefixq.
void Assembler::popcntq(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}
4721 
// Population count, register form: F3 REX.W 0F B8 /r (F3 before REX).
void Assembler::popcntq(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}
4730 
// Pop the stack top into memory: 0x8F /0 (rax encodes the /0 digit).
void Assembler::popq(Address dst) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst);
}
4737 
// Save all 15 general registers (except rsp) into a 16-slot frame, the
// counterpart of popa().  The original rsp is stashed first, below the
// current stack pointer, before rsp itself is adjusted.
void Assembler::pusha() { // 64bit
  // we have to store original rsp.  ABI says that 128 bytes
  // below rsp are local scratch.
  movq(Address(rsp, -5 * wordSize), rsp);

  subq(rsp, 16 * wordSize);

  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  // skip rsp
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
4762 
// Push a memory operand: 0xFF /6 (rsi encodes the /6 digit).
void Assembler::pushq(Address src) {
  InstructionMark im(this);
  prefixq(src);
  emit_byte(0xFF);
  emit_operand(rsi, src);
}
4769 
// Rotate-through-carry left, 64-bit: REX.W + D1 /2 (count 1) or
// REX.W + C1 /2 ib.
void Assembler::rclq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}
// Arithmetic right shift, 64-bit: REX.W + D1 /7 (count 1) or
// REX.W + C1 /7 ib.
void Assembler::sarq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xF8 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xF8 | encode);
    emit_byte(imm8);
  }
}
4794 
// Arithmetic right shift by cl: REX.W + D3 /7.
void Assembler::sarq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}
4800 
// 64-bit subtract-with-borrow of an immediate from memory:
// REX.W + 0x81/0x83 /3 (rbx encodes the /3 digit).
void Assembler::sbbq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbx, dst, imm32);
}
4806 
// 64-bit subtract-with-borrow of an immediate: REX.W + 0x81 /3.
void Assembler::sbbq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD8, dst, imm32);
}
4811 
// 64-bit subtract-with-borrow from memory: REX.W + 0x1B /r.
void Assembler::sbbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}
4818 
// 64-bit register subtract-with-borrow: REX.W + 0x1B /r.
void Assembler::sbbq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}
4823 
// Left shift, 64-bit: REX.W + D1 /4 (count 1) or REX.W + C1 /4 ib.
void Assembler::shlq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}
4836 
// Left shift by cl: REX.W + D3 /4.
void Assembler::shlq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}
4842 
4843 void Assembler::shrq(Register dst, int imm8) {
4844   assert(isShiftCount(imm8 >> 1), "illegal shift count");
4845   int encode = prefixq_and_encode(dst->encoding());
4846   emit_byte(0xC1);
4847   emit_byte(0xE8 | encode);
4848   emit_byte(imm8);
4849 }
4850 
// Logical right shift by cl: REX.W + D3 /5.
void Assembler::shrq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}
4856 
// 64-bit subtract of an immediate from memory: REX.W + 0x81/0x83 /5
// (rbp encodes the /5 digit).
void Assembler::subq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);
}
4862 
// 64-bit subtract of a register from memory: REX.W + 0x29 /r.
void Assembler::subq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}
4869 
// 64-bit subtract of an immediate from a register: REX.W + 0x81 /5.
void Assembler::subq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE8, dst, imm32);
}
4874 
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (needed when the instruction length must be fixed, e.g. for patching).
void Assembler::subq_imm32(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}
4880 
// 64-bit subtract from memory: REX.W + 0x2B /r (SUB r64, r/m64).
void Assembler::subq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}
4887 
// 64-bit register subtract: REX.W + 0x2B /r.
void Assembler::subq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
4892 
// 64-bit TEST against an immediate.  Uses the short rax-specific form
// REX.W + A9 id when dst is rax, else REX.W + F7 /0 id.
void Assembler::testq(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    prefix(REX_W);
    emit_byte(0xA9);
  } else {
    encode = prefixq_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}
4908 
// 64-bit register TEST: REX.W + 0x85 /r.
void Assembler::testq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}
4913 
// 64-bit exchange-and-add: REX.W + 0F C1 /r.  Callers add LOCK separately
// if atomicity is required.
void Assembler::xaddq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}
4921 
// Exchange a register with memory: REX.W + 0x87 /r (implicitly locked
// by the hardware when a memory operand is involved).
void Assembler::xchgq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}
4928 
4929 void Assembler::xchgq(Register dst, Register src) {
4930   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
4931   emit_byte(0x87);
4932   emit_byte(0xc0 | encode);
4933 }
4934 
// 64-bit register XOR: REX.W + 0x33 /r.
void Assembler::xorq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}
4939 
// 64-bit XOR from memory: REX.W + 0x33 /r (XOR r64, r/m64).
void Assembler::xorq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}
4946 
4947 #endif // !LP64
4948 
// Table mapping each Assembler::Condition code (used as the array index; the
// numeric value is shown in each entry's comment) to its logical negation.
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
4968 
4969 
4970 // Implementation of MacroAssembler
4971 
4972 // First all the versions that have distinct versions depending on 32/64 bit
4973 // Unless the difference is trivial (1 line or so).
4974 
4975 #ifndef _LP64
4976 
4977 // 32bit versions
4978 
// 32-bit: an AddressLiteral can be used directly as an absolute Address.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}
4982 
// 32-bit: build an absolute base+index Address from an ArrayAddress.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}
4986 
// 32-bit biased-locking fast path.  Attempts to acquire (or re-acquire) the
// bias of obj_reg's header for the current thread without a CAS when
// possible.  swap_reg must be rax (cmpxchg requirement); tmp_reg may be
// noreg, in which case lock_reg is borrowed (saved/restored around each use).
// On success jumps to 'done'; on contention optionally jumps to *slow_case;
// otherwise falls through to 'cas_label' for the normal CAS-based lock.
// Returns the code offset of the first mark-word access, usable for an
// implicit null check on obj_reg.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    need_tmp_reg = true;
    tmp_reg = lock_reg;
  } else {
    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  }
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movl(swap_reg, mark_addr);
  }
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, swap_reg);
  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  // Note that because there is no current thread register on x86 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movl(saved_mark_addr, swap_reg);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  xorl(swap_reg, tmp_reg);
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  movl(tmp_reg, klass_addr);
  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
  jcc(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testl(swap_reg, markOopDesc::epoch_mask_in_place);
  jcc(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  movl(swap_reg, saved_mark_addr);
  andl(swap_reg,
       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  orl(tmp_reg, swap_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  movl(swap_reg, klass_addr);
  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
  movl(swap_reg, saved_mark_addr);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  movl(swap_reg, saved_mark_addr);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, klass_addr);
  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
// 32-bit leaf-call helper: calls entry_point directly, then pops the
// stack-passed arguments (pushed by the pass_argN helpers) off rsp.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}
5203 
// Compare a memory operand against an oop constant (32-bit: oops are
// 32-bit immediates carrying an oop relocation).
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5207 
// Compare a register against an oop constant (32-bit immediate + oop reloc).
void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5211 
// Sign-extend lo into hi (making hi:lo a 64-bit value).  Uses the one-byte
// CDQ when the registers happen to be rdx:rax on P6+; otherwise mov + sar.
void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}
5221 
// Jump to L if the x87 FPU status flag C2 is set.  fnstsw_ax/sahf copy the
// FPU status word into EFLAGS (C2 lands in the parity flag); rax is
// preserved via save_rax/restore_rax using tmp.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}
5231 
// Jump to L if the x87 FPU status flag C2 is clear (inverse of jC2).
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}
5241 
5242 // 32bit can do a case table jump in one instruction but we no longer allow the base
5243 // to be installed in the Address class
// Indirect jump through an array entry (case-table dispatch), 32-bit form.
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}
5247 
// Note: y_lo will be destroyed
// Compare the 64-bit values x_hi:x_lo and y_hi:y_lo and leave -1/0/1 in
// x_hi (Java lcmp semantics): signed compare on the high words, unsigned
// on the low words.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);      // result = 1
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);     // result = -1

  bind(done);
}
5273 
// Load the literal's address (not its contents) into dst, with relocation.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}
5277 
// Store the literal's address into a memory location, with relocation.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}
5283 
// Tear down the current frame: rsp = rbp, then pop the saved rbp.
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
5288 
// 64-bit multiply of two stack-resident longs (32-bit VM).  Clobbers
// rax, rbx, rcx, rdx; result is left in rdx:rax.
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
5328 
// Two's-complement negate of the 64-bit value hi:lo:
// neg lo; hi = -(hi + carry).
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}
5334 
// 64-bit left shift of hi:lo by the count in rcx (Java lshl semantics:
// count taken mod 64).  hi/lo must not be rcx.
void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
5354 
5355 
// 64-bit right shift of hi:lo by the count in rcx; arithmetic (lshr with
// sign_extension=true) or logical (lushr).  hi/lo must not be rcx.
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
5376 
// Load an oop constant into a register (32-bit immediate + oop relocation).
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5380 
// Store an oop constant to memory (32-bit immediate + oop relocation).
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5384 
// Load a pointer-sized value: if src is an lval, materialize the literal's
// address itself (with relocation); otherwise load from the target address.
void MacroAssembler::movptr(Register dst, AddressLiteral src) {
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}
5392 
// Store a register to an array slot (32-bit: plain movl).
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}
5396 
// Load a register from an array slot (32-bit: plain movl).
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}
5400 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (a raw pointer here would need relocation info, which movl doesn't attach).
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
5405 
5406 
// Restore the registers saved by push_callee_saved_registers (reverse order).
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}
5413 
// Reload the top-of-stack x87 value spilled by push_fTOS and free its
// two-word stack slot.
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}
5418 
// Save rsi, rdi, rdx, rcx on the stack (paired with
// pop_callee_saved_registers, which pops in reverse order).
void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}
5425 
// Spill the x87 top-of-stack (as a double) into a fresh two-word stack slot.
void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}
5430 
5431 
// Push an oop constant (32-bit immediate with an oop relocation).
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}
5435 
5436 
// Push a pointer-sized value: the literal's own address (with relocation)
// if it is an lval, otherwise the value stored at the target address.
void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}
5444 
// dst := (ZF clear) ? 1 : 0, widened to a full word.  The xor must come
// first since set_byte_if_not_zero only writes the low byte.
void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}
5449 
// 32-bit argument-passing helpers: each C argument is pushed on the stack.
// Callers push in reverse order (arg3 first); call_VM_leaf_base pops them.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5465 
5466 #ifndef PRODUCT
5467 extern "C" void findpc(intptr_t x);
5468 #endif
5469 
5470 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
5471   // In order to get locks to work, we need to fake a in_VM state
5472   JavaThread* thread = JavaThread::current();
5473   JavaThreadState saved_state = thread->thread_state();
5474   thread->set_thread_state(_thread_in_vm);
5475   if (ShowMessageBoxOnError) {
5476     JavaThread* thread = JavaThread::current();
5477     JavaThreadState saved_state = thread->thread_state();
5478     thread->set_thread_state(_thread_in_vm);
5479     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
5480       ttyLocker ttyl;
5481       BytecodeCounter::print();
5482     }
5483     // To see where a verify_oop failed, get $ebx+40/X for this frame.
5484     // This is the value of eip which points to where verify_oop will return.
5485     if (os::message_box(msg, "Execution stopped, print registers?")) {
5486       ttyLocker ttyl;
5487       tty->print_cr("eip = 0x%08x", eip);
5488 #ifndef PRODUCT
5489       if ((WizardMode || Verbose) && PrintMiscellaneous) {
5490         tty->cr();
5491         findpc(eip);
5492         tty->cr();
5493       }
5494 #endif
5495       tty->print_cr("rax = 0x%08x", rax);
5496       tty->print_cr("rbx = 0x%08x", rbx);
5497       tty->print_cr("rcx = 0x%08x", rcx);
5498       tty->print_cr("rdx = 0x%08x", rdx);
5499       tty->print_cr("rdi = 0x%08x", rdi);
5500       tty->print_cr("rsi = 0x%08x", rsi);
5501       tty->print_cr("rbp = 0x%08x", rbp);
5502       tty->print_cr("rsp = 0x%08x", rsp);
5503       BREAKPOINT;
5504       assert(false, "start up GDB");
5505     }
5506   } else {
5507     ttyLocker ttyl;
5508     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
5509     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
5510   }
5511   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
5512 }
5513 
// Emit code that halts execution with a message: push msg and a fake
// return eip, pusha all GP registers (matching debug32's signature),
// call debug32, then hlt in case it returns.
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                           // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}
5523 
// Emit code that prints a warning message (via the VM 'warning' routine)
// and continues; the full CPU state is saved and restored around the call.
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
5535 
5536 #else // _LP64
5537 
5538 // 64 bit versions
5539 
// 64-bit: convert a reachable AddressLiteral into a RIP-relative Address
// (disp32 computed against the current pc).
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}
5549 
// 64-bit: materialize an ArrayAddress by loading the base into rscratch1
// (absolute addresses may not fit in disp32) and indexing off it.
// Clobbers rscratch1.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
5558 
5559 int MacroAssembler::biased_locking_enter(Register lock_reg,
5560                                          Register obj_reg,
5561                                          Register swap_reg,
5562                                          Register tmp_reg,
5563                                          bool swap_reg_contains_mark,
5564                                          Label& done,
5565                                          Label* slow_case,
5566                                          BiasedLockingCounters* counters) {
5567   assert(UseBiasedLocking, "why call this otherwise?");
5568   assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
5569   assert(tmp_reg != noreg, "tmp_reg must be supplied");
5570   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
5571   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
5572   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
5573   Address saved_mark_addr(lock_reg, 0);
5574 
5575   if (PrintBiasedLockingStatistics && counters == NULL)
5576     counters = BiasedLocking::counters();
5577 
5578   // Biased locking
5579   // See whether the lock is currently biased toward our thread and
5580   // whether the epoch is still valid
5581   // Note that the runtime guarantees sufficient alignment of JavaThread
5582   // pointers to allow age to be placed into low bits
5583   // First check to see whether biasing is even enabled for this object
5584   Label cas_label;
5585   int null_check_offset = -1;
5586   if (!swap_reg_contains_mark) {
5587     null_check_offset = offset();
5588     movq(swap_reg, mark_addr);
5589   }
5590   movq(tmp_reg, swap_reg);
5591   andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5592   cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
5593   jcc(Assembler::notEqual, cas_label);
5594   // The bias pattern is present in the object's header. Need to check
5595   // whether the bias owner and the epoch are both still current.
5596   load_prototype_header(tmp_reg, obj_reg);
5597   orq(tmp_reg, r15_thread);
5598   xorq(tmp_reg, swap_reg);
5599   andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
5600   if (counters != NULL) {
5601     cond_inc32(Assembler::zero,
5602                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5603   }
5604   jcc(Assembler::equal, done);
5605 
5606   Label try_revoke_bias;
5607   Label try_rebias;
5608 
5609   // At this point we know that the header has the bias pattern and
5610   // that we are not the bias owner in the current epoch. We need to
5611   // figure out more details about the state of the header in order to
5612   // know what operations can be legally performed on the object's
5613   // header.
5614 
5615   // If the low three bits in the xor result aren't clear, that means
5616   // the prototype header is no longer biased and we have to revoke
5617   // the bias on this object.
5618   testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
5619   jcc(Assembler::notZero, try_revoke_bias);
5620 
5621   // Biasing is still enabled for this data type. See whether the
5622   // epoch of the current bias is still valid, meaning that the epoch
5623   // bits of the mark word are equal to the epoch bits of the
5624   // prototype header. (Note that the prototype header's epoch bits
5625   // only change at a safepoint.) If not, attempt to rebias the object
5626   // toward the current thread. Note that we must be absolutely sure
5627   // that the current epoch is invalid in order to do this because
5628   // otherwise the manipulations it performs on the mark word are
5629   // illegal.
5630   testq(tmp_reg, markOopDesc::epoch_mask_in_place);
5631   jcc(Assembler::notZero, try_rebias);
5632 
5633   // The epoch of the current bias is still valid but we know nothing
5634   // about the owner; it might be set or it might be clear. Try to
5635   // acquire the bias of the object using an atomic operation. If this
5636   // fails we will go in to the runtime to revoke the object's bias.
5637   // Note that we first construct the presumed unbiased header so we
5638   // don't accidentally blow away another thread's valid bias.
5639   andq(swap_reg,
5640        markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
5641   movq(tmp_reg, swap_reg);
5642   orq(tmp_reg, r15_thread);
5643   if (os::is_MP()) {
5644     lock();
5645   }
5646   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5647   // If the biasing toward our thread failed, this means that
5648   // another thread succeeded in biasing it toward itself and we
5649   // need to revoke that bias. The revocation will occur in the
5650   // interpreter runtime in the slow case.
5651   if (counters != NULL) {
5652     cond_inc32(Assembler::zero,
5653                ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
5654   }
5655   if (slow_case != NULL) {
5656     jcc(Assembler::notZero, *slow_case);
5657   }
5658   jmp(done);
5659 
5660   bind(try_rebias);
5661   // At this point we know the epoch has expired, meaning that the
5662   // current "bias owner", if any, is actually invalid. Under these
5663   // circumstances _only_, we are allowed to use the current header's
5664   // value as the comparison value when doing the cas to acquire the
5665   // bias in the current epoch. In other words, we allow transfer of
5666   // the bias from one thread to another directly in this situation.
5667   //
5668   // FIXME: due to a lack of registers we currently blow away the age
5669   // bits in this situation. Should attempt to preserve them.
5670   load_prototype_header(tmp_reg, obj_reg);
5671   orq(tmp_reg, r15_thread);
5672   if (os::is_MP()) {
5673     lock();
5674   }
5675   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5676   // If the biasing toward our thread failed, then another thread
5677   // succeeded in biasing it toward itself and we need to revoke that
5678   // bias. The revocation will occur in the runtime in the slow case.
5679   if (counters != NULL) {
5680     cond_inc32(Assembler::zero,
5681                ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
5682   }
5683   if (slow_case != NULL) {
5684     jcc(Assembler::notZero, *slow_case);
5685   }
5686   jmp(done);
5687 
5688   bind(try_revoke_bias);
5689   // The prototype mark in the klass doesn't have the bias bit set any
5690   // more, indicating that objects of this data type are not supposed
5691   // to be biased any more. We are going to try to reset the mark of
5692   // this object to the prototype value and fall through to the
5693   // CAS-based locking scheme. Note that if our CAS fails, it means
5694   // that another thread raced us for the privilege of revoking the
5695   // bias of this particular object, so it's okay to continue in the
5696   // normal locking code.
5697   //
5698   // FIXME: due to a lack of registers we currently blow away the age
5699   // bits in this situation. Should attempt to preserve them.
5700   load_prototype_header(tmp_reg, obj_reg);
5701   if (os::is_MP()) {
5702     lock();
5703   }
5704   cmpxchgq(tmp_reg, Address(obj_reg, 0));
5705   // Fall through to the normal CAS-based lock, because no matter what
5706   // the result of the above CAS, some thread must have succeeded in
5707   // removing the bias bit from the object's header.
5708   if (counters != NULL) {
5709     cond_inc32(Assembler::zero,
5710                ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
5711   }
5712 
5713   bind(cas_label);
5714 
5715   return null_check_offset;
5716 }
5717 
// 64-bit leaf-call helper: arguments are already in registers (per the
// native ABI); this only ensures 16-byte stack alignment around the call
// and, on Win64, allocates/frees the required register-argument shadow area.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for it's register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
5751 
5752 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
5753   assert(!src2.is_lval(), "should use cmpptr");
5754 
5755   if (reachable(src2)) {
5756     cmpq(src1, as_Address(src2));
5757   } else {
5758     lea(rscratch1, src2);
5759     Assembler::cmpq(src1, Address(rscratch1, 0));
5760   }
5761 }
5762 
// Emit a Java ldiv/lrem sequence that filters out the one case where
// idivq would trap: min_long / -1 (quotient overflows).  Returns the
// code offset of the idivq instruction for implicit-exception mapping.
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq(); // sign-extend rax into rdx:rax for the 128/64 divide
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
5799 
5800 void MacroAssembler::decrementq(Register reg, int value) {
5801   if (value == min_jint) { subq(reg, value); return; }
5802   if (value <  0) { incrementq(reg, -value); return; }
5803   if (value == 0) {                        ; return; }
5804   if (value == 1 && UseIncDec) { decq(reg) ; return; }
5805   /* else */      { subq(reg, value)       ; return; }
5806 }
5807 
5808 void MacroAssembler::decrementq(Address dst, int value) {
5809   if (value == min_jint) { subq(dst, value); return; }
5810   if (value <  0) { incrementq(dst, -value); return; }
5811   if (value == 0) {                        ; return; }
5812   if (value == 1 && UseIncDec) { decq(dst) ; return; }
5813   /* else */      { subq(dst, value)       ; return; }
5814 }
5815 
5816 void MacroAssembler::incrementq(Register reg, int value) {
5817   if (value == min_jint) { addq(reg, value); return; }
5818   if (value <  0) { decrementq(reg, -value); return; }
5819   if (value == 0) {                        ; return; }
5820   if (value == 1 && UseIncDec) { incq(reg) ; return; }
5821   /* else */      { addq(reg, value)       ; return; }
5822 }
5823 
5824 void MacroAssembler::incrementq(Address dst, int value) {
5825   if (value == min_jint) { addq(dst, value); return; }
5826   if (value <  0) { decrementq(dst, -value); return; }
5827   if (value == 0) {                        ; return; }
5828   if (value == 1 && UseIncDec) { incq(dst) ; return; }
5829   /* else */      { addq(dst, value)       ; return; }
5830 }
5831 
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  // Materialize the table base in rscratch1, then patch it into the
  // index Address so a single indirect jmp performs the table dispatch.
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}
5841 
// Two-register long compare exists only for the 32-bit port; it must
// never be reached in the 64-bit build.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
5846 
// Load the literal's target address (with its relocation) into dst.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}
5850 
// Store the literal's target address (with its relocation) into the
// memory word at dst, going through rscratch1.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}
5855 
// Emit the one-byte LEAVE opcode (tear down the rbp frame).
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_byte(0xC9); // LEAVE
}
5860 
// Two-register long negate exists only for the 32-bit port; it must
// never be reached in the 64-bit build.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
5865 
// Load an oop constant into dst as a 64-bit immediate, recorded with an
// immediate-oop relocation so the GC can update it.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}
5869 
// Store an oop constant to memory via rscratch1 (no 64-bit immediate
// store exists), recorded with an immediate-oop relocation.
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}
5874 
5875 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
5876   if (src.is_lval()) {
5877     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
5878   } else {
5879     if (reachable(src)) {
5880       movq(dst, as_Address(src));
5881     } else {
5882       lea(rscratch1, src);
5883       movq(dst, Address(rscratch1,0));
5884     }
5885   }
5886 }
5887 
// Store src into the array slot described by dst.
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}
5891 
// Load the array slot described by src into dst.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}
5895 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (a raw pointer needs a relocation record; this stores a plain 64-bit
// value via rscratch1 because x86 has no 64-bit immediate store).
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}
5901 
// These are mostly for initializing NULL
// (stores the sign-extended 32-bit immediate as a 64-bit word).
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}
5906 
// Load a sign-extended 32-bit constant into a 64-bit register.
void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
5910 
// Push an oop constant on the stack via rscratch1 (keeps the oop
// relocation from movoop).
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}
5915 
5916 void MacroAssembler::pushptr(AddressLiteral src) {
5917   lea(rscratch1, src);
5918   if (src.is_lval()) {
5919     push(rscratch1);
5920   } else {
5921     pushq(Address(rscratch1, 0));
5922   }
5923 }
5924 
// Clear the thread's last-Java-frame anchor fields (64-bit version: the
// current thread is always in r15).
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}
5939 
// Record the anchor of the last Java frame in the current thread (r15).
// fp and pc are optional; sp defaults to the current rsp.  Note that sp
// is stored last — presumably so the anchor only becomes visible once
// fully populated (TODO confirm against stack-walking code).
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
5964 
5965 static void pass_arg0(MacroAssembler* masm, Register arg) {
5966   if (c_rarg0 != arg ) {
5967     masm->mov(c_rarg0, arg);
5968   }
5969 }
5970 
5971 static void pass_arg1(MacroAssembler* masm, Register arg) {
5972   if (c_rarg1 != arg ) {
5973     masm->mov(c_rarg1, arg);
5974   }
5975 }
5976 
5977 static void pass_arg2(MacroAssembler* masm, Register arg) {
5978   if (c_rarg2 != arg ) {
5979     masm->mov(c_rarg2, arg);
5980   }
5981 }
5982 
5983 static void pass_arg3(MacroAssembler* masm, Register arg) {
5984   if (c_rarg3 != arg ) {
5985     masm->mov(c_rarg3, arg);
5986   }
5987 }
5988 
// Emit code that halts the VM with a diagnostic: dumps all GP registers
// to the stack, aligns rsp per the ABI, and calls debug64(msg, rip,
// regs).  hlt() afterwards guarantees we never fall through.
void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
5999 
6000 void MacroAssembler::warn(const char* msg) {
6001   push(rsp);
6002   andq(rsp, -16);     // align stack as required by push_CPU_state and call
6003 
6004   push_CPU_state();   // keeps alignment at 16 bytes
6005   lea(c_rarg0, ExternalAddress((address) msg));
6006   call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
6007   pop_CPU_state();
6008   pop(rsp);
6009 }
6010 
6011 #ifndef PRODUCT
6012 extern "C" void findpc(intptr_t x);
6013 #endif
6014 
// Runtime target of MacroAssembler::stop(): optionally pops a message
// box and prints the register dump produced by pusha().  regs[] is
// indexed from the stack pointer upward, so rax (pushed first, highest
// address) is regs[15] and r15 (pushed last) is regs[0].
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr("rax = 0x%016lx", regs[15]);
      tty->print_cr("rbx = 0x%016lx", regs[12]);
      tty->print_cr("rcx = 0x%016lx", regs[14]);
      tty->print_cr("rdx = 0x%016lx", regs[13]);
      tty->print_cr("rdi = 0x%016lx", regs[8]);
      tty->print_cr("rsi = 0x%016lx", regs[9]);
      tty->print_cr("rbp = 0x%016lx", regs[10]);
      tty->print_cr("rsp = 0x%016lx", regs[11]);
      tty->print_cr("r8  = 0x%016lx", regs[7]);
      tty->print_cr("r9  = 0x%016lx", regs[6]);
      tty->print_cr("r10 = 0x%016lx", regs[5]);
      tty->print_cr("r11 = 0x%016lx", regs[4]);
      tty->print_cr("r12 = 0x%016lx", regs[3]);
      tty->print_cr("r13 = 0x%016lx", regs[2]);
      tty->print_cr("r14 = 0x%016lx", regs[1]);
      tty->print_cr("r15 = 0x%016lx", regs[0]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}
6064 
6065 #endif // _LP64
6066 
6067 // Now versions that are common to 32/64 bit
6068 
// Pointer-sized add of an immediate: addq on 64-bit, addl on 32-bit.
void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}
6072 
// Pointer-sized register add: addq on 64-bit, addl on 32-bit.
void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}
6076 
// Pointer-sized add into memory: addq on 64-bit, addl on 32-bit.
void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}
6080 
6081 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
6082   if (reachable(src)) {
6083     Assembler::addsd(dst, as_Address(src));
6084   } else {
6085     lea(rscratch1, src);
6086     Assembler::addsd(dst, Address(rscratch1, 0));
6087   }
6088 }
6089 
6090 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
6091   if (reachable(src)) {
6092     addss(dst, as_Address(src));
6093   } else {
6094     lea(rscratch1, src);
6095     addss(dst, Address(rscratch1, 0));
6096   }
6097 }
6098 
6099 void MacroAssembler::align(int modulus) {
6100   if (offset() % modulus != 0) {
6101     nop(modulus - (offset() % modulus));
6102   }
6103 }
6104 
// Packed-double AND from an address literal (typical use: sign masking).
void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    // Out of RIP-relative range: materialize the address in rscratch1.
    lea(rscratch1, src);
    Assembler::andpd(dst, Address(rscratch1, 0));
  }
}
6115 
// Packed-float AND from an address literal (typical use: sign masking).
void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    // Out of RIP-relative range: materialize the address in rscratch1.
    lea(rscratch1, src);
    Assembler::andps(dst, Address(rscratch1, 0));
  }
}
6126 
// Pointer-sized AND with an immediate: andq on 64-bit, andl on 32-bit.
void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}
6130 
// Atomically increment the 32-bit counter at counter_addr.  The flags
// register is preserved via pushf/popf since the increment clobbers it;
// the lock prefix is only emitted on multiprocessor systems.
void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
  pushf();
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
  popf();
}
6138 
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  // Store `size` one page down per iteration until it underflows.
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // The -1 because we already subtracted 1 page.
  for (int i = 0; i< StackShadowPages-1; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
6161 
// Emit the biased-locking unlock fast path: if the mark word still
// carries the biased pattern, unlocking is a no-op and we jump to done.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
6176 
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
6185 
// Wouldn't need if AddressLiteral version had new name
// (plain forward to the base assembler's label call).
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
6190 
// Indirect call through a register; plain forward to the base assembler.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
6194 
6195 void MacroAssembler::call(AddressLiteral entry) {
6196   if (reachable(entry)) {
6197     Assembler::call_literal(entry.target(), entry.rspec());
6198   } else {
6199     lea(rscratch1, entry);
6200     Assembler::call(rscratch1);
6201   }
6202 }
6203 
6204 // Implementation of call_VM versions
6205 
// call_VM with no arguments.  The local call to C pushes a return
// address that call_VM_helper uses to derive last_Java_sp/pc; the body
// at C performs the real VM call and returns past the jmp to E.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
6219 
// call_VM with one register argument (same local-call trampoline as the
// zero-argument form; see call_VM_helper for the last_Java_sp math).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
6235 
// call_VM with two register arguments.  Arguments are shuffled in
// reverse order so an earlier shuffle cannot clobber a later source.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
6256 
// call_VM with three register arguments.  Arguments are shuffled in
// reverse order so an earlier shuffle cannot clobber a later source;
// the asserts catch caller register choices that would still collide.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
6282 
// call_VM with an explicit last_java_sp; forwards to call_VM_base with
// the current thread register (r15 on 64-bit, looked up on 32-bit).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
6291 
// call_VM (explicit last_java_sp) with one register argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6300 
// call_VM (explicit last_java_sp) with two register arguments; shuffled
// in reverse order so an earlier shuffle cannot clobber a later source.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6313 
// call_VM (explicit last_java_sp) with three register arguments;
// shuffled in reverse order so shuffles cannot clobber later sources.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6329 
// Like call_VM but always dispatches to MacroAssembler::call_VM_base
// directly (bypassing any virtual override in a subclass).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
6338 
// super_call_VM with one register argument.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6347 
// super_call_VM with two register arguments; shuffled in reverse order
// so an earlier shuffle cannot clobber a later source.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6360 
// super_call_VM with three register arguments; shuffled in reverse
// order so shuffles cannot clobber later sources.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6376 
// The workhorse behind every call_VM variant: records the last Java
// frame, calls the VM entry point with the thread as the (implicit)
// first argument, restores the frame anchor, propagates pending
// exceptions, and fetches the oop result out of the thread if requested.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      stop("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true, false);

#ifndef CC_INTERP
   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);
#endif /* CC_INTERP */

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
    movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
    verify_oop(oop_result, "broken oop in call_VM_base");
  }
}
6477 
// Compute last_Java_sp for the call_VM trampolines and forward to
// call_VM_base.  Must only be invoked from inside a call_VM trampoline
// (it relies on the intermediate call's return address on the stack).
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // use to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
6500 
// Leaf call with arguments already in place; forwards to the base helper.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
6504 
// Leaf call with one register argument.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
6509 
// Leaf call with two register arguments; shuffled in reverse order so
// an earlier shuffle cannot clobber a later source.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
6517 
// Leaf call with three register arguments; shuffled in reverse order so
// an earlier shuffle cannot clobber a later source.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
6527 
// Leaf call (non-virtual dispatch to the base helper) with one argument.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
6532 
// Leaf call (non-virtual dispatch) with two arguments; shuffled in
// reverse order so an earlier shuffle cannot clobber a later source.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
6540 
// Leaf call (non-virtual dispatch) with three arguments; shuffled in
// reverse order so shuffles cannot clobber later sources.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
6550 
// Leaf call (non-virtual dispatch) with four arguments; shuffled in
// reverse order so shuffles cannot clobber later sources.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
6564 
// Intentionally empty hook called after every call_VM; subclasses (e.g.
// interpreter macro assemblers) override it — presumably to emit the
// early-return check (TODO confirm against the overriding class).
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
6567 
// Intentionally empty hook called after every call_VM; subclasses (e.g.
// interpreter macro assemblers) override it — presumably to emit the
// pop-frame check (TODO confirm against the overriding class).
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
6570 
6571 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
6572   if (reachable(src1)) {
6573     cmpl(as_Address(src1), imm);
6574   } else {
6575     lea(rscratch1, src1);
6576     cmpl(Address(rscratch1, 0), imm);
6577   }
6578 }
6579 
6580 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
6581   assert(!src2.is_lval(), "use cmpptr");
6582   if (reachable(src2)) {
6583     cmpl(src1, as_Address(src2));
6584   } else {
6585     lea(rscratch1, src2);
6586     cmpl(src1, Address(rscratch1, 0));
6587   }
6588 }
6589 
// Thin forward to the raw encoder; exists for interface symmetry with the
// other cmp32 overloads.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
6593 
// Thin forward to the raw encoder for a register-vs-memory 32-bit compare.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
6597 
// Materialize a double compare as an int in dst: -1 if opr1 < opr2,
// 0 if equal, +1 if greater.  ucomisd sets PF for an unordered result
// (NaN operand); unordered_is_less selects whether NaN maps to -1 or +1.
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered -> keep -1
    jcc(Assembler::below , L);  // less      -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     -> 0
    increment(dst);             // greater   -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered -> keep +1
    jcc(Assembler::above , L);  // greater   -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     -> 0
    decrementl(dst);            // less      -> -1
  }
  bind(L);
}
6619 
// Float analogue of cmpsd2int: -1/0/+1 in dst from a ucomiss compare,
// with NaN mapped by unordered_is_less.  Same jcc ladder, single-precision
// compare instruction.
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered -> keep -1
    jcc(Assembler::below , L);  // less      -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     -> 0
    increment(dst);             // greater   -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered -> keep +1
    jcc(Assembler::above , L);  // greater   -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     -> 0
    decrementl(dst);            // less      -> -1
  }
  bind(L);
}
6641 
6642 
6643 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
6644   if (reachable(src1)) {
6645     cmpb(as_Address(src1), imm);
6646   } else {
6647     lea(rscratch1, src1);
6648     cmpb(Address(rscratch1, 0), imm);
6649   }
6650 }
6651 
// Pointer-sized compare of a register against an AddressLiteral.  An
// "lval" literal means the literal's address itself is the comparand,
// not the memory it points to.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    // Compare against the 64-bit literal address itself.
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    // Memory operand out of range: go through the scratch register.
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    // 32-bit: the address fits in an immediate; keep the reloc info.
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}
6671 
// Compare memory against a literal address value.  Only lval literals are
// legal: x86 has no memory-to-memory compare, so the literal must be the
// immediate value itself (loaded into rscratch1 on 64-bit).
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
6682 
// Atomic pointer compare-and-exchange at a literal address.  Emits a lock
// prefix on MP systems for cross-processor atomicity.
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
  if (reachable(adr)) {
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    // lea must precede lock(): the prefix applies to the instruction
    // emitted immediately after it.
    lea(rscratch1, adr);
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, Address(rscratch1, 0));
  }
}
6695 
// Pointer-width cmpxchg: 64-bit form on LP64, 32-bit otherwise.
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}
6699 
6700 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
6701   if (reachable(src)) {
6702     Assembler::comisd(dst, as_Address(src));
6703   } else {
6704     lea(rscratch1, src);
6705     Assembler::comisd(dst, Address(rscratch1, 0));
6706   }
6707 }
6708 
6709 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
6710   if (reachable(src)) {
6711     Assembler::comiss(dst, as_Address(src));
6712   } else {
6713     lea(rscratch1, src);
6714     Assembler::comiss(dst, Address(rscratch1, 0));
6715   }
6716 }
6717 
6718 
// Atomically increment a 32-bit counter when 'cond' holds.  Implemented by
// jumping over the increment on the negated condition, since the increment
// itself is a multi-instruction sequence.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  atomic_incl(counter_addr);
  bind(L);
}
6726 
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case: min_int / -1 would raise #DE (overflow) in
  // hardware, but Java defines the result as min_int with remainder 0.
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdql(); // sign-extend rax into rdx:rax as idivl requires
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
6762 
6763 
6764 
// Decrement a register by a constant, choosing the cheapest encoding.
// min_jint is handled first because -min_jint overflows, so it cannot be
// forwarded to incrementl.
void MacroAssembler::decrementl(Register reg, int value) {
  if (value == min_jint) {subl(reg, value) ; return; }
  if (value <  0) { incrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(reg) ; return; }
  /* else */      { subl(reg, value)       ; return; }
}
6772 
// Memory-operand twin of decrementl(Register, int); same min_jint overflow
// guard and dec-vs-sub encoding choice.
void MacroAssembler::decrementl(Address dst, int value) {
  if (value == min_jint) {subl(dst, value) ; return; }
  if (value <  0) { incrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decl(dst) ; return; }
  /* else */      { subl(dst, value)       ; return; }
}
6780 
// Signed division of reg by 2^shift_value via arithmetic shift.  A plain
// sar rounds toward negative infinity; to truncate toward zero instead,
// negative dividends are biased by (2^shift - 1) before the shift.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
6797 
6798 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
6799   if (reachable(src)) {
6800     Assembler::divsd(dst, as_Address(src));
6801   } else {
6802     lea(rscratch1, src);
6803     Assembler::divsd(dst, Address(rscratch1, 0));
6804   }
6805 }
6806 
6807 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
6808   if (reachable(src)) {
6809     Assembler::divss(dst, as_Address(src));
6810   } else {
6811     lea(rscratch1, src);
6812     Assembler::divss(dst, Address(rscratch1, 0));
6813   }
6814 }
6815 
6816 // !defined(COMPILER2) is because of stupid core builds
6817 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
6818 void MacroAssembler::empty_FPU_stack() {
6819   if (VM_Version::supports_mmx()) {
6820     emms();
6821   } else {
6822     for (int i = 8; i-- > 0; ) ffree(i);
6823   }
6824 }
6825 #endif // !LP64 || C1 || !C2
6826 
6827 
// Defines obj, preserves var_size_in_bytes
// Lock-free bump-pointer allocation in eden: CAS the heap top forward and
// retry on contention.  Jumps to slow_case when inline allocation is not
// supported or the object would not fit.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert(obj == rax, "obj must be in rax, for cmpxchg");
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    jmp(slow_case);
  } else {
    Register end = t1;
    Label retry;
    bind(retry);
    ExternalAddress heap_top((address) Universe::heap()->top_addr());
    movptr(obj, heap_top);
    // end = obj + size (constant size when var_size_in_bytes == noreg)
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
    }
    // if end < obj then we wrapped around => object too long => slow case
    cmpptr(end, obj);
    jcc(Assembler::below, slow_case);
    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
    jcc(Assembler::above, slow_case);
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
    // it otherwise. Use lock prefix for atomicity on MPs.
    locked_cmpxchgptr(end, heap_top);
    jcc(Assembler::notEqual, retry);
  }
}
6861 
// Standard frame prologue: save caller's rbp and establish a new frame
// pointer (equivalent to the x86 'enter 0,0' without its overhead).
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}
6866 
// A 5 byte nop that is safe for patching (see patch_verified_entry)
// The fallback form uses four segment-override prefixes plus a one-byte
// nop so the whole sequence is exactly 5 bytes.
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    emit_byte(0x26); // es:
    emit_byte(0x2e); // cs:
    emit_byte(0x64); // fs:
    emit_byte(0x65); // gs:
    emit_byte(0x90);
  }
}
6879 
// Convenience form: compare ST0 with ST1 and pop both operands.
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}
6883 
// Compare ST0 with ST(index) and leave the result in the CPU eflags.
// On CPUs with cmov/fucomi the flags are set directly; otherwise the FPU
// status word is transferred through rax (fnstsw_ax + sahf), which is why
// a temp register is needed on that path.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
6919 
// Convenience form: compare ST0 with ST1, popping both, and produce
// -1/0/+1 in dst.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}
6923 
// x87 analogue of cmpsd2int: compare via fcmp, then map the eflags to
// -1/0/+1 in dst, with NaN direction chosen by unordered_is_less.
// dst doubles as fcmp's temp when cmov is unavailable.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered -> keep -1
    jcc(Assembler::below , L);  // less      -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     -> 0
    increment(dst);             // greater   -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered -> keep +1
    jcc(Assembler::above , L);  // greater   -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // equal     -> 0
    decrementl(dst);            // less      -> -1
  }
  bind(L);
}
6944 
// Load a double literal onto the x87 stack.
// NOTE(review): no reachability fallback here, unlike e.g. movdbl —
// presumably callers only pass reachable literals; confirm on LP64.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}
6948 
// Load a float literal onto the x87 stack (assumes the literal is
// reachable; see note on fld_d).
void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}
6952 
// Load an 80-bit extended-precision literal onto the x87 stack (assumes
// the literal is reachable; see note on fld_d).
void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}
6956 
// Load the x87 control word from a literal address (assumes the literal
// is reachable; see note on fld_d).
void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}
6960 
// Compute 2^X where X is on top of the FPU stack, by splitting X into
// integer and fractional parts: 2^X = 2^int(X) * 2^(X-int(X)).  The
// integer part's power of two is built by writing the biased exponent
// directly into a double's bit pattern on the stack.  Out-of-range
// exponents yield NaN so the caller can fall back to the runtime.
void MacroAssembler::pow_exp_core_encoding() {
  // kills rax, rcx, rdx
  subptr(rsp,sizeof(jdouble));
  // computes 2^X. Stack: X ...
  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
  // keep it on the thread's stack to compute 2^int(X) later
  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
  fld_s(0);                 // Stack: X X ...
  frndint();                // Stack: int(X) X ...
  fsuba(1);                 // Stack: int(X) X-int(X) ...
  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
  faddp(1);                 // Stack: 2^(X-int(X))
  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
  // shift int(X)+1023 to exponent position.
  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
  // values so detect them and set result to NaN.
  movl(rax,Address(rsp,0));
  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
  addl(rax, 1023);
  movl(rdx,rax);
  shll(rax,20);             // move biased exponent into the high dword's exponent field
  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
  addl(rdx,1);
  // Check that 1 < int(X)+1023+1 < 2048
  // in 3 steps:
  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
  // 2- (int(X)+1023+1)&-2048 != 0
  // 3- (int(X)+1023+1)&-2048 != 1
  // Do 2- first because addl just updated the flags.
  cmov32(Assembler::equal,rax,rcx);
  cmpl(rdx,1);
  cmov32(Assembler::equal,rax,rcx);
  testl(rdx,rcx);
  cmov32(Assembler::notEqual,rax,rcx);
  movl(Address(rsp,4),rax); // high dword: sign/exponent of 2^int(X)
  movl(Address(rsp,0),0);   // low dword: zero mantissa bits
  fmul_d(Address(rsp,0));   // Stack: 2^X ...
  addptr(rsp,sizeof(jdouble));
}
7004 
// Compute X^Y with X in ST1 and Y*... prepared for fyl2x; expects the
// stack arranged so fyl2x leaves Y*log2(X) on top.
void MacroAssembler::fast_pow() {
  // computes X^Y = 2^(Y * log2(X))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  fyl2x();                 // Stack: (Y*log2(X)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
}
7012 
// Compute exp(X) for X on top of the FPU stack.
void MacroAssembler::fast_exp() {
  // computes exp(X) = 2^(X * log2(e))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  fldl2e();                // Stack: log2(e) X ...
  fmulp(1);                // Stack: (X*log2(e)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
}
7021 
// Compute exp(X) (is_exp) or X^Y (otherwise) with the arguments on the
// FPU stack, using the fast 2^x encoding above where possible and falling
// back to a runtime call (SharedRuntime::dexp/dpow) for NaN results,
// non-integer Y with negative X, and other cases the fast path rejects.
// For negative X the sign of the result depends on the parity of int(Y),
// which is tested in an integer register.
void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
  // kills rax, rcx, rdx
  // pow and exp needs 2 extra registers on the fpu stack.
  Label slow_case, done;
  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rdx,
    tmp = rdx;
  }
  Register tmp2 = rax;
  Register tmp3 = rcx;

  if (is_exp) {
    // Stack: X
    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
    fast_exp();                 // Stack: exp(X) X
    fcmp(tmp, 0, false, false); // Stack: exp(X) X
    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate argument. Stack: exp(X)
    if (num_fpu_regs_in_use > 0) {
      fxch();
      fpop();
    } else {
      ffree(1);
    }
    jmp(done);
  } else {
    // Stack: X Y
    Label x_negative, y_odd;

    fldz();                     // Stack: 0 X Y
    fcmp(tmp, 1, true, false);  // Stack: X Y
    jcc(Assembler::above, x_negative);

    // X >= 0

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fast_pow();                 // Stack: X^Y X Y
    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
    // X^Y not equal to itself: X^Y is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }
    jmp(done);

    // X <= 0
    bind(x_negative);

    fld_s(1);                   // Stack: Y X Y
    frndint();                  // Stack: int(Y) X Y
    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
    jcc(Assembler::notEqual, slow_case);

    subptr(rsp, 8);

    // For X^Y, when X < 0, Y has to be an integer and the final
    // result depends on whether it's odd or even. We just checked
    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
    // integer to test its parity. If int(Y) is huge and doesn't fit
    // in the 64 bit integer range, the integer indefinite value will
    // end up in the gp registers. Huge numbers are all even, the
    // integer indefinite number is even so it's fine.

#ifdef ASSERT
    // Let's check we don't end up with an integer indefinite number
    // when not expected. First test for huge numbers: check whether
    // int(Y)+1 == int(Y) which is true for very large numbers and
    // those are all even. A 64 bit integer is guaranteed to not
    // overflow for numbers where y+1 != y (when precision is set to
    // double precision).
    Label y_not_huge;

    fld1();                     // Stack: 1 int(Y) X Y
    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y

#ifdef _LP64
    // trip to memory to force the precision down from double extended
    // precision
    fstp_d(Address(rsp, 0));
    fld_d(Address(rsp, 0));
#endif

    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
#endif

    // move int(Y) as 64 bit integer to thread's stack
    fistp_d(Address(rsp,0));    // Stack: X Y

#ifdef ASSERT
    jcc(Assembler::notEqual, y_not_huge);

    // Y is huge so we know it's even. It may not fit in a 64 bit
    // integer and we don't want the debug code below to see the
    // integer indefinite value so overwrite int(Y) on the thread's
    // stack with 0.
    movl(Address(rsp, 0), 0);
    movl(Address(rsp, 4), 0);

    bind(y_not_huge);
#endif

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fabs();                     // Stack: abs(X) Y X Y
    fast_pow();                 // Stack: abs(X)^Y X Y
    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.

    // pop int(Y) into gp registers: low half into tmp2 (rax), and on
    // 32-bit the high half into tmp3 (rcx).
    pop(tmp2);
    NOT_LP64(pop(tmp3));
    jcc(Assembler::parity, slow_case);

#ifdef ASSERT
    // Check that int(Y) is not integer indefinite value (int
    // overflow). Shouldn't happen because for values that would
    // overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
    {
      Label integer;
      testl(tmp2, tmp2);
      jcc(Assembler::notZero, integer);
      cmpl(tmp3, 0x80000000);
      jcc(Assembler::notZero, integer);
      stop("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#else
    {
      Label integer;
      mov(tmp3, tmp2); // preserve tmp2 for parity check below
      shlq(tmp3, 1);
      jcc(Assembler::carryClear, integer);
      jcc(Assembler::notZero, integer);
      stop("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#endif
#endif

    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }

    // test bit 0 of int(Y): zero means Y is even.
    testl(tmp2, 1);
    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
    // X <= 0, Y odd: X^Y = -abs(X)^Y
    fchs();                     // Stack: -abs(X)^Y Y
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  fpop();                       // pop incorrect result or int(Y)

  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
                      is_exp ? 1 : 2, num_fpu_regs_in_use);

  // Come here with result in F-TOS
  bind(done);
}
7197 
// Discard the top of the x87 stack: free ST0, then advance the stack-top
// pointer (ffree alone leaves the stack pointer in place).
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}
7202 
// Compute the x87 partial remainder of ST0 / ST1, looping until fprem
// reports completion.  fprem may return a partial result, signalled via
// the FPU C2 flag: on LP64 that is bit 0x400 of the status word read by
// fnstsw_ax; on 32-bit sahf maps C2 to PF.  rax is clobbered inside, so
// preserve it via tmp.
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
#ifdef _LP64
    testl(rax, 0x400);
    jcc(Assembler::notEqual, L);
#else
    sahf();
    jcc(Assembler::parity, L);
#endif // _LP64
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
7224 
7225 
7226 void MacroAssembler::incrementl(AddressLiteral dst) {
7227   if (reachable(dst)) {
7228     incrementl(as_Address(dst));
7229   } else {
7230     lea(rscratch1, dst);
7231     incrementl(Address(rscratch1, 0));
7232   }
7233 }
7234 
// Increment a 32-bit array element addressed via an ArrayAddress.
void MacroAssembler::incrementl(ArrayAddress dst) {
  incrementl(as_Address(dst));
}
7238 
// Increment a register by a constant, choosing the cheapest encoding.
// min_jint is handled first because -min_jint overflows, so it cannot be
// forwarded to decrementl.
void MacroAssembler::incrementl(Register reg, int value) {
  if (value == min_jint) {addl(reg, value) ; return; }
  if (value <  0) { decrementl(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(reg) ; return; }
  /* else */      { addl(reg, value)       ; return; }
}
7246 
// Memory-operand twin of incrementl(Register, int); same min_jint overflow
// guard and inc-vs-add encoding choice.
void MacroAssembler::incrementl(Address dst, int value) {
  if (value == min_jint) {addl(dst, value) ; return; }
  if (value <  0) { decrementl(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incl(dst) ; return; }
  /* else */      { addl(dst, value)       ; return; }
}
7254 
7255 void MacroAssembler::jump(AddressLiteral dst) {
7256   if (reachable(dst)) {
7257     jmp_literal(dst.target(), dst.rspec());
7258   } else {
7259     lea(rscratch1, dst);
7260     jmp(rscratch1);
7261   }
7262 }
7263 
// Conditional jump to a literal destination.  When the target is in range
// the jcc is encoded by hand (short 2-byte form when the displacement
// fits in 8 bits, else the 6-byte long form).  When out of range there is
// no conditional indirect jump on x86, so emit the inverse condition
// hopping over an indirect jmp through rscratch1.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)_code_pos);
    // Only non-relocated targets may use the short form: a reloc needs
    // the full 32-bit displacement to patch.
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
7292 
7293 void MacroAssembler::ldmxcsr(AddressLiteral src) {
7294   if (reachable(src)) {
7295     Assembler::ldmxcsr(as_Address(src));
7296   } else {
7297     lea(rscratch1, src);
7298     Assembler::ldmxcsr(Address(rscratch1, 0));
7299   }
7300 }
7301 
// Load and sign-extend a byte.  P6+ (and all 64-bit CPUs) use movsbl;
// older CPUs emulate with zero-extend then shift/sar.  Returns the code
// offset of the loading instruction (for implicit null checks).
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}
7314 
// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
// Load and sign-extend a 16-bit value; returns the code offset of the
// loading instruction (for implicit null checks).
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}
7334 
// Load and zero-extend a byte; returns the code offset of the loading
// instruction.  The xorl+movb fallback avoids movzx on pre-P6 CPUs but
// cannot be used when src's address uses dst (the xor would clobber it).
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}
7349 
// Note: load_unsigned_short used to be called load_unsigned_word.
// Load and zero-extend a 16-bit value; returns the code offset of the
// loading instruction.  Same P6/partial-register reasoning as
// load_unsigned_byte above.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
7365 
// Load a value of 1, 2, 4 or 8 bytes from src into dst, sign- or
// zero-extending sub-word sizes.  On 32-bit, an 8-byte load needs dst2
// for the high half.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}
7383 
// Store a value of 1, 2, 4 or 8 bytes to dst.  On 32-bit, an 8-byte store
// needs src2 for the high half.  Counterpart of load_sized_value.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
7401 
7402 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
7403   if (reachable(dst)) {
7404     movl(as_Address(dst), src);
7405   } else {
7406     lea(rscratch1, dst);
7407     movl(Address(rscratch1, 0), src);
7408   }
7409 }
7410 
7411 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
7412   if (reachable(src)) {
7413     movl(dst, as_Address(src));
7414   } else {
7415     lea(rscratch1, src);
7416     movl(dst, Address(rscratch1, 0));
7417   }
7418 }
7419 
7420 // C++ bool manipulation
7421 
7422 void MacroAssembler::movbool(Register dst, Address src) {
7423   if(sizeof(bool) == 1)
7424     movb(dst, src);
7425   else if(sizeof(bool) == 2)
7426     movw(dst, src);
7427   else if(sizeof(bool) == 4)
7428     movl(dst, src);
7429   else
7430     // unsupported
7431     ShouldNotReachHere();
7432 }
7433 
// Store a constant C++ bool to memory, sized to match sizeof(bool).
void MacroAssembler::movbool(Address dst, bool boolconst) {
  if(sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if(sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if(sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}
7445 
// Store a register-held C++ bool to memory, sized to match sizeof(bool).
void MacroAssembler::movbool(Address dst, Register src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}
7457 
// Store a constant byte to an array element.
void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}
7461 
// Load a double literal into an XMM register.  movsd zeroes the upper
// half of the register; movlpd leaves it untouched — the flag selects
// whichever is faster on the current CPU.
void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, as_Address(src));
    } else {
      movlpd(dst, as_Address(src));
    }
  } else {
    lea(rscratch1, src);
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, Address(rscratch1, 0));
    } else {
      movlpd(dst, Address(rscratch1, 0));
    }
  }
}
7478 
7479 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
7480   if (reachable(src)) {
7481     movss(dst, as_Address(src));
7482   } else {
7483     lea(rscratch1, src);
7484     movss(dst, Address(rscratch1, 0));
7485   }
7486 }
7487 
// Pointer-width register move: 64-bit on LP64, 32-bit otherwise.
void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
7491 
// Pointer-width load from memory: 64-bit on LP64, 32-bit otherwise.
void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
7495 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// (a real pointer would need relocation info, which this form does not emit).
void MacroAssembler::movptr(Register dst, intptr_t src) {
  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
}
7500 
// Pointer-width store to memory: 64-bit on LP64, 32-bit otherwise.
void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
7504 
7505 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
7506   if (reachable(src)) {
7507     Assembler::movsd(dst, as_Address(src));
7508   } else {
7509     lea(rscratch1, src);
7510     Assembler::movsd(dst, Address(rscratch1, 0));
7511   }
7512 }
7513 
7514 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
7515   if (reachable(src)) {
7516     Assembler::movss(dst, as_Address(src));
7517   } else {
7518     lea(rscratch1, src);
7519     Assembler::movss(dst, Address(rscratch1, 0));
7520   }
7521 }
7522 
7523 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
7524   if (reachable(src)) {
7525     Assembler::mulsd(dst, as_Address(src));
7526   } else {
7527     lea(rscratch1, src);
7528     Assembler::mulsd(dst, Address(rscratch1, 0));
7529   }
7530 }
7531 
7532 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
7533   if (reachable(src)) {
7534     Assembler::mulss(dst, as_Address(src));
7535   } else {
7536     lea(rscratch1, src);
7537     Assembler::mulss(dst, Address(rscratch1, 0));
7538   }
7539 }
7540 
7541 void MacroAssembler::null_check(Register reg, int offset) {
7542   if (needs_explicit_null_check(offset)) {
7543     // provoke OS NULL exception if reg = NULL by
7544     // accessing M[reg] w/o changing any (non-CC) registers
7545     // NOTE: cmpl is plenty here to provoke a segv
7546     cmpptr(rax, Address(reg, 0));
7547     // Note: should probably use testl(rax, Address(reg, 0));
7548     //       may be shorter code (however, this version of
7549     //       testl needs to be implemented first)
7550   } else {
7551     // nothing to do, (later) access of M[reg + offset]
7552     // will provoke OS NULL exception if reg = NULL
7553   }
7554 }
7555 
// Emit a call into the VM's breakpoint helper rather than a raw int3.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
7561 
// Restore the full CPU state saved by push_CPU_state. The FPU state was
// pushed last, so it is popped first; order must mirror push_CPU_state.
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}
7566 
7567 void MacroAssembler::pop_FPU_state() {
7568   NOT_LP64(frstor(Address(rsp, 0));)
7569   LP64_ONLY(fxrstor(Address(rsp, 0));)
7570   addptr(rsp, FPUStateSizeInWords * wordSize);
7571 }
7572 
7573 void MacroAssembler::pop_IU_state() {
7574   popa();
7575   LP64_ONLY(addq(rsp, 8));
7576   popf();
7577 }
7578 
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
// Order must mirror pop_CPU_state: integer state first, FPU state last.
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}
7585 
// Reserve a save area on the stack and dump the FPU (32-bit: fnsave) or
// FPU/SSE (64-bit: fxsave) state into it; undone by pop_FPU_state.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // _LP64
}
7595 
7596 void MacroAssembler::push_IU_state() {
7597   // Push flags first because pusha kills them
7598   pushf();
7599   // Make sure rsp stays 16-byte aligned
7600   LP64_ONLY(subq(rsp, 8));
7601   pusha();
7602 }
7603 
7604 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
7605   // determine java_thread register
7606   if (!java_thread->is_valid()) {
7607     java_thread = rdi;
7608     get_thread(java_thread);
7609   }
7610   // we must set sp to zero to clear frame
7611   movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
7612   if (clear_fp) {
7613     movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
7614   }
7615 
7616   if (clear_pc)
7617     movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
7618 
7619 }
7620 
7621 void MacroAssembler::restore_rax(Register tmp) {
7622   if (tmp == noreg) pop(rax);
7623   else if (tmp != rax) mov(rax, tmp);
7624 }
7625 
7626 void MacroAssembler::round_to(Register reg, int modulus) {
7627   addptr(reg, modulus - 1);
7628   andptr(reg, -modulus);
7629 }
7630 
7631 void MacroAssembler::save_rax(Register tmp) {
7632   if (tmp == noreg) push(rax);
7633   else if (tmp != rax) mov(tmp, rax);
7634 }
7635 
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
// Note: tmp is clobbered; it ends up holding the page offset.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  // tmp := (thread >> shift) & (page_size - sizeof(int))
  movl(tmp, thread);
  shrl(tmp, os::get_serialize_page_shift_count());
  andl(tmp, (os::vm_page_size() - sizeof(int)));

  Address index(noreg, tmp, Address::times_1);
  ExternalAddress page(os::get_memory_serialize_page());

  // Size of store must match masking code above
  movl(as_Address(ArrayAddress(page, index)), tmp);
}
7651 
// Calls to C land
//
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine java_thread register (caller may pass noreg; rdi is used as temp)
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register (defaults to the current rsp)
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  // last_Java_sp is stored last; reset_last_Java_frame clears it first.
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
7687 
7688 void MacroAssembler::shlptr(Register dst, int imm8) {
7689   LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
7690 }
7691 
7692 void MacroAssembler::shrptr(Register dst, int imm8) {
7693   LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
7694 }
7695 
7696 void MacroAssembler::sign_extend_byte(Register reg) {
7697   if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
7698     movsbl(reg, reg); // movsxb
7699   } else {
7700     shll(reg, 24);
7701     sarl(reg, 24);
7702   }
7703 }
7704 
7705 void MacroAssembler::sign_extend_short(Register reg) {
7706   if (LP64_ONLY(true ||) VM_Version::is_P6()) {
7707     movswl(reg, reg); // movsxw
7708   } else {
7709     shll(reg, 16);
7710     sarl(reg, 16);
7711   }
7712 }
7713 
// testl against a literal address. Unlike the XMM wrappers below there is
// no rscratch1 fallback here: the literal must be directly reachable.
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
7718 
7719 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
7720   if (reachable(src)) {
7721     Assembler::sqrtsd(dst, as_Address(src));
7722   } else {
7723     lea(rscratch1, src);
7724     Assembler::sqrtsd(dst, Address(rscratch1, 0));
7725   }
7726 }
7727 
7728 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
7729   if (reachable(src)) {
7730     Assembler::sqrtss(dst, as_Address(src));
7731   } else {
7732     lea(rscratch1, src);
7733     Assembler::sqrtss(dst, Address(rscratch1, 0));
7734   }
7735 }
7736 
7737 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
7738   if (reachable(src)) {
7739     Assembler::subsd(dst, as_Address(src));
7740   } else {
7741     lea(rscratch1, src);
7742     Assembler::subsd(dst, Address(rscratch1, 0));
7743   }
7744 }
7745 
7746 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
7747   if (reachable(src)) {
7748     Assembler::subss(dst, as_Address(src));
7749   } else {
7750     lea(rscratch1, src);
7751     Assembler::subss(dst, Address(rscratch1, 0));
7752   }
7753 }
7754 
7755 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
7756   if (reachable(src)) {
7757     Assembler::ucomisd(dst, as_Address(src));
7758   } else {
7759     lea(rscratch1, src);
7760     Assembler::ucomisd(dst, Address(rscratch1, 0));
7761   }
7762 }
7763 
7764 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
7765   if (reachable(src)) {
7766     Assembler::ucomiss(dst, as_Address(src));
7767   } else {
7768     lea(rscratch1, src);
7769     Assembler::ucomiss(dst, Address(rscratch1, 0));
7770   }
7771 }
7772 
7773 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
7774   // Used in sign-bit flipping with aligned address.
7775   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
7776   if (reachable(src)) {
7777     Assembler::xorpd(dst, as_Address(src));
7778   } else {
7779     lea(rscratch1, src);
7780     Assembler::xorpd(dst, Address(rscratch1, 0));
7781   }
7782 }
7783 
7784 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
7785   // Used in sign-bit flipping with aligned address.
7786   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
7787   if (reachable(src)) {
7788     Assembler::xorps(dst, as_Address(src));
7789   } else {
7790     lea(rscratch1, src);
7791     Assembler::xorps(dst, Address(rscratch1, 0));
7792   }
7793 }
7794 
7795 // AVX 3-operands instructions
7796 
7797 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7798   if (reachable(src)) {
7799     vaddsd(dst, nds, as_Address(src));
7800   } else {
7801     lea(rscratch1, src);
7802     vaddsd(dst, nds, Address(rscratch1, 0));
7803   }
7804 }
7805 
7806 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7807   if (reachable(src)) {
7808     vaddss(dst, nds, as_Address(src));
7809   } else {
7810     lea(rscratch1, src);
7811     vaddss(dst, nds, Address(rscratch1, 0));
7812   }
7813 }
7814 
7815 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7816   if (reachable(src)) {
7817     vandpd(dst, nds, as_Address(src));
7818   } else {
7819     lea(rscratch1, src);
7820     vandpd(dst, nds, Address(rscratch1, 0));
7821   }
7822 }
7823 
7824 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7825   if (reachable(src)) {
7826     vandps(dst, nds, as_Address(src));
7827   } else {
7828     lea(rscratch1, src);
7829     vandps(dst, nds, Address(rscratch1, 0));
7830   }
7831 }
7832 
7833 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7834   if (reachable(src)) {
7835     vdivsd(dst, nds, as_Address(src));
7836   } else {
7837     lea(rscratch1, src);
7838     vdivsd(dst, nds, Address(rscratch1, 0));
7839   }
7840 }
7841 
7842 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7843   if (reachable(src)) {
7844     vdivss(dst, nds, as_Address(src));
7845   } else {
7846     lea(rscratch1, src);
7847     vdivss(dst, nds, Address(rscratch1, 0));
7848   }
7849 }
7850 
7851 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7852   if (reachable(src)) {
7853     vmulsd(dst, nds, as_Address(src));
7854   } else {
7855     lea(rscratch1, src);
7856     vmulsd(dst, nds, Address(rscratch1, 0));
7857   }
7858 }
7859 
7860 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7861   if (reachable(src)) {
7862     vmulss(dst, nds, as_Address(src));
7863   } else {
7864     lea(rscratch1, src);
7865     vmulss(dst, nds, Address(rscratch1, 0));
7866   }
7867 }
7868 
7869 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7870   if (reachable(src)) {
7871     vsubsd(dst, nds, as_Address(src));
7872   } else {
7873     lea(rscratch1, src);
7874     vsubsd(dst, nds, Address(rscratch1, 0));
7875   }
7876 }
7877 
7878 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7879   if (reachable(src)) {
7880     vsubss(dst, nds, as_Address(src));
7881   } else {
7882     lea(rscratch1, src);
7883     vsubss(dst, nds, Address(rscratch1, 0));
7884   }
7885 }
7886 
7887 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7888   if (reachable(src)) {
7889     vxorpd(dst, nds, as_Address(src));
7890   } else {
7891     lea(rscratch1, src);
7892     vxorpd(dst, nds, Address(rscratch1, 0));
7893   }
7894 }
7895 
7896 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
7897   if (reachable(src)) {
7898     vxorps(dst, nds, as_Address(src));
7899   } else {
7900     lea(rscratch1, src);
7901     vxorps(dst, nds, Address(rscratch1, 0));
7902   }
7903 }
7904 
7905 
7906 //////////////////////////////////////////////////////////////////////////////////
7907 #ifndef SERIALGC
7908 
// G1 SATB pre-write barrier. If concurrent marking is active, record the
// value about to be overwritten into the thread-local SATB queue; call into
// the runtime when the queue is full.
//   obj      - address of the field being stored to (noreg if pre_val is
//              already loaded by the caller)
//   pre_val  - receives/holds the previous value of the field
//   thread   - current JavaThread (must be r15_thread on 64-bit)
//   tmp      - temp register, clobbered
//   tosca_live  - rax holds a live value and must be preserved across the
//                 runtime call
//   expand_call - expand call_VM_leaf inline (see comment below)
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg) {
    assert_different_registers(obj, pre_val, tmp);
    assert(pre_val != rax, "check this code");
  }

  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active? If not, the barrier is a no-op.
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null? Nothing to record then.
  cmpptr(pre_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  movptr(tmp, index);                   // tmp := *index_adr
  cmpptr(tmp, 0);                       // tmp == 0?
  jcc(Assembler::equal, runtime);       // If yes, goto runtime

  // The queue fills downward: decrement the index, then store at buf+index.
  subptr(tmp, wordSize);                // tmp := tmp - wordSize
  movptr(index, tmp);                   // *index_adr := tmp
  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr

  // Record the previous value
  movptr(Address(tmp, 0), pre_val);
  jmp(done);

  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);

  if (obj != noreg && obj != rax)
    push(obj);

  if (pre_val != rax)
    push(pre_val);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we care generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have have a full interpreter frame on the stack
  // expand_call should be passed true.

  NOT_LP64( push(thread); )

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  NOT_LP64( pop(thread); )

  // restore the live input values (reverse order of the pushes above)
  if (pre_val != rax)
    pop(pre_val);

  if (obj != noreg && obj != rax)
    pop(obj);

  if(tosca_live) pop(rax);

  bind(done);
}
8022 
// G1 post-write barrier. After a store of new_val to store_addr that
// crosses heap regions and stores non-NULL, dirty the card covering
// store_addr and enqueue it on the thread's dirty card queue, calling into
// the runtime when the queue is full.
//   store_addr - address stored to; tmp/tmp2 are clobbered
//   thread     - current JavaThread (must be r15_thread on 64-bit)
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  Label done;
  Label runtime;

  // Does store cross heap regions? (XOR leaves only differing bits; if the
  // addresses agree above the region-size bits, the barrier is a no-op.)

  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?

  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?

  ExternalAddress cardtable((address) ct->byte_map_base);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
#ifdef _LP64
  const Register card_addr = tmp;

  movq(card_addr, store_addr);
  shrq(card_addr, CardTableModRefBS::card_shift);

  lea(tmp2, cardtable);

  // get the address of the card
  addq(card_addr, tmp2);
#else
  const Register card_index = tmp;

  movl(card_index, store_addr);
  shrl(card_index, CardTableModRefBS::card_shift);

  Address index(noreg, card_index, Address::times_1);
  const Register card_addr = tmp;
  lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
#endif
  cmpb(Address(card_addr, 0), 0);
  jcc(Assembler::equal, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  movb(Address(card_addr, 0), 0);

  // Enqueue the card: the queue fills downward, like the SATB queue above.
  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  movl(Address(tmp2, 0), card_index);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);
}
8116 
8117 #endif // SERIALGC
8118 //////////////////////////////////////////////////////////////////////////////////
8119 
8120 
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  // Split into two parts (see below) so that callers can schedule other
  // instructions in between.
  store_check_part_1(obj);
  store_check_part_2(obj);
}
8127 
void MacroAssembler::store_check(Register obj, Address dst) {
  // The card table is indexed by the object address only; the destination
  // address 'dst' is not needed here.
  store_check(obj);
}
8131 
8132 
// split the store check operation so that other instructions can be scheduled inbetween
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  // Convert the oop address in obj into its card table index (destructive).
  shrptr(obj, CardTableModRefBS::card_shift);
}
8139 
// Second half of the store check: obj holds a card index (produced by
// store_check_part_1); dirty the corresponding card table byte.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and
  // it will never need to be relocated. On 64bit however the value may be too
  // large for a 32bit displacement

  intptr_t disp = (intptr_t) ct->byte_map_base;
  if (is_simm32(disp)) {
    // Fast path: byte_map_base fits in a 32-bit displacement.
    Address cardtable(noreg, obj, Address::times_1, disp);
    movb(cardtable, 0);
  } else {
    // By doing it as an ExternalAddress disp could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and
    // a smarter version of as_Address. Worst case it is two instructions which
    // is no worse off then loading disp into a register and doing as a simple
    // Address() as above.
    // We can't do as ExternalAddress as the only style since if disp == 0 we'll
    // assert since NULL isn't acceptable in a reloci (see 6644928). In any case
    // in some cases we'll get a single instruction version.

    ExternalAddress cardtable((address)disp);
    Address index(noreg, obj, Address::times_1);
    movb(as_Address(ArrayAddress(cardtable, index)), 0);
  }
}
8171 
8172 void MacroAssembler::subptr(Register dst, int32_t imm32) {
8173   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
8174 }
8175 
8176 // Force generation of a 4 byte immediate value even if it fits into 8bit
8177 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
8178   LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
8179 }
8180 
8181 void MacroAssembler::subptr(Register dst, Register src) {
8182   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
8183 }
8184 
8185 // C++ bool manipulation
8186 void MacroAssembler::testbool(Register dst) {
8187   if(sizeof(bool) == 1)
8188     testb(dst, 0xff);
8189   else if(sizeof(bool) == 2) {
8190     // testw implementation needed for two byte bools
8191     ShouldNotReachHere();
8192   } else if(sizeof(bool) == 4)
8193     testl(dst, dst);
8194   else
8195     // unsupported
8196     ShouldNotReachHere();
8197 }
8198 
8199 void MacroAssembler::testptr(Register dst, Register src) {
8200   LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
8201 }
8202 
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Bump-pointer allocation out of the thread-local allocation buffer:
// obj := current TLAB top; jumps to slow_case if the new top would pass
// TLAB end, otherwise commits the new top. Size is either the constant
// con_size_in_bytes (var_size_in_bytes == noreg) or the register value.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  assert_different_registers(obj, t1, t2);
  assert_different_registers(obj, var_size_in_bytes, t1);
  Register end = t2;
  // On 32-bit t1 is used to hold the thread pointer.
  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);

  verify_tlab();

  NOT_LP64(get_thread(thread));

  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
  if (var_size_in_bytes == noreg) {
    lea(end, Address(obj, con_size_in_bytes));
  } else {
    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
  }
  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
  jcc(Assembler::above, slow_case);

  // update the tlab top pointer
  movptr(Address(thread, JavaThread::tlab_top_offset()), end);

  // recover var_size_in_bytes if necessary (caller passed t2 == size reg)
  if (var_size_in_bytes == end) {
    subptr(var_size_in_bytes, obj);
  }
  verify_tlab();
}
8237 
// Preserves rbx, and rdx.
// Refill the thread-local allocation buffer from eden. Either retains the
// current TLAB (jumps to try_eden) when too much space would be wasted by
// discarding it, or discards it (filling the remainder with a dummy int
// array so the heap stays parseable), allocates a new TLAB from eden, and
// jumps to retry. Jumps to slow_case when inline allocation is impossible.
// Returns the register holding the thread for use by the caller.
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = rax;
  Register t1  = rcx;
  Register t2  = rsi;
  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    jmp(slow_case);
  }

  NOT_LP64(get_thread(thread_reg));

  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space (in heap words)
  subptr(t1, top);
  shrptr(t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.
  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  jcc(Assembler::lessEqual, discard_tlab);

  // Retain
  // %%% yuck as movptr...
  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
  if (TLABStats) {
    // increment number of slow_allocations
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
  }
  jmp(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
    // accumulate wastage -- t1 is amount free in tlab
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  testptr(top, top);
  jcc(Assembler::zero, do_refill);

  // set up the mark word
  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  // set the length to the remaining space (t1 holds free heap words here)
  subptr(t1, typeArrayOopDesc::header_size(T_INT));
  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
  // set klass to intArrayKlass
  // dubious reloc why not an oop reloc?
  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // account the filler object's bytes as allocated
  movptr(t1, top);
  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
  incr_allocated_bytes(thread_reg, t1, 0);

  // refill the tlab with an eden allocation
  bind(do_refill);
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
  shlptr(t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = rsi;
    assert_different_registers(tsize, thread_reg, t1);
    push(tsize);
    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
    shlptr(tsize, LogHeapWordSize);
    cmpptr(t1, tsize);
    jcc(Assembler::equal, ok);
    stop("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    pop(tsize);
  }
#endif
  // install the new TLAB bounds: start/top at the new block, end at
  // top + size - alignment_reserve
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
  addptr(top, t1);
  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
  verify_tlab();
  jmp(retry);

  return thread_reg; // for use by caller
}
8344 
// Add an allocation size to the thread's cumulative allocated_bytes counter.
// The size is either the register var_size_in_bytes (if valid) or the
// constant con_size_in_bytes. On 32-bit the 64-bit counter is updated with
// an addl/adcl pair (low word then carry into high word); t1 is only needed
// there, as a temp for the thread pointer when 'thread' is not supplied.
void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  if (!thread->is_valid()) {
#ifdef _LP64
    thread = r15_thread;
#else
    assert(t1->is_valid(), "need temp reg");
    thread = t1;
    get_thread(thread);
#endif
  }

#ifdef _LP64
  if (var_size_in_bytes->is_valid()) {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
#else
  if (var_size_in_bytes->is_valid()) {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
  // propagate the carry into the high 32 bits of the 64-bit counter
  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
8374 
// Fall back to a leaf runtime call for an FPU operation (e.g. the
// SharedRuntime::dsin/dcos/dtan entry points used by trigfunc).  On entry
// the nb_args double arguments (at most 2 on 64-bit) are on the x87 FPU
// stack; on exit the double result is in F-TOS.  All general-purpose
// registers are preserved via pusha/popa, live XMM registers are spilled
// around the call, and when num_fpu_regs_in_use > 1 the remaining x87
// stack entries are saved and restored as well.
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  pusha();

  // if we are coming from c1, xmm registers may be live
  if (UseSSE >= 1) {
    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
  }
  // Spill the XMM register file into the scratch area just allocated
  // (singles under UseSSE == 1, full doubles under UseSSE >= 2).
  int off = 0;
  if (UseSSE == 1)  {
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
  } else if (UseSSE >= 2)  {
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
#ifdef _LP64
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
#endif
  }

  // Preserve registers across runtime call
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument(s) to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin, dcos etc.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    // Reload the argument(s) from their just-saved slots so they are back
    // on the FPU stack for the argument-marshalling sequence below.
    for (int i = nb_args-1; i >= 0; i--) {
      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
    }
  }

  // Pop the incoming argument(s) from the FPU stack into a fresh stack
  // area; on 32-bit this area doubles as the outgoing C argument area.
  subptr(rsp, nb_args*sizeof(jdouble));
  for (int i = 0; i < nb_args; i++) {
    fstp_d(Address(rsp, i*sizeof(jdouble)));
  }

#ifdef _LP64
  // 64-bit C ABI: the first two double arguments are passed in xmm0/xmm1.
  if (nb_args > 0) {
    movdbl(xmm0, Address(rsp, 0));
  }
  if (nb_args > 1) {
    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
  }
  assert(nb_args <= 2, "unsupported number of args");
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  NEEDS_CLEANUP;
  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level

  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);

#ifdef _LP64
  // 64-bit: the result comes back in xmm0; route it through memory onto
  // the x87 stack, since callers expect the result in F-TOS.
  movsd(Address(rsp, 0), xmm0);
  fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble) * nb_args);   // release the argument area
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU
    // stack except incoming arguments
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    // Reload the x87 registers that were spilled above (all but the
    // argument slots, which are discarded below).
    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
    // Push the saved return value back onto the FPU stack and release
    // the remaining save area.
    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble) * nb_args);
  }

  // Restore the XMM register file spilled on entry.
  off = 0;
  if (UseSSE == 1)  {
    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
  } else if (UseSSE >= 2)  {
    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
#ifdef _LP64
    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
#endif
  }
  if (UseSSE >= 1) {
    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
  }
  popa();
}
8512 
// pi/4, the cutoff for the hardware fast path in trigfunc() below.
static const double     pi_4 =  0.7853981633974483;

// Compute a trigonometric function of the double currently in F-TOS,
// leaving the result in F-TOS.  'trig' selects the operation: 's' = sin,
// 'c' = cos, 't' = tan.  Arguments with |x| <= pi/4 use the hardware
// fsin/fcos/ftan instructions directly; larger arguments (or the case
// where the pi_4 constant is not reachable) take a runtime call through
// fp_runtime_fallback.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    fcmp(tmp);
    // |X| > pi/4 means the hardware instructions are unreliable: bail out.
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  switch(trig) {
  case 's':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
    }
    break;
  case 'c':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
    }
    break;
  case 't':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }

  // Come here with result in F-TOS
  bind(done);

  // Restore rbx if it was preserved for fcmp above.
  if (tmp != noreg) {
    pop(tmp);
  }
}
8589 
8590 
8591 // Look up the method for a megamorphic invokeinterface call.
8592 // The target method is determined by <intf_klass, itable_index>.
8593 // The receiver klass is in recv_klass.
8594 // On success, the result will be in method_result, and execution falls through.
8595 // On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = instanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // Load the vtable length so we can skip past the vtable entries below.
  movl(scan_temp, Address(recv_klass, instanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The scan loop is unrolled once ("peeled"): the first itable entry is
  // tested straight-line and branches directly to found_method on a hit;
  // subsequent entries go through the 'search' loop head below.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}
8664 
8665 
8666 void MacroAssembler::check_klass_subtype(Register sub_klass,
8667                            Register super_klass,
8668                            Register temp_reg,
8669                            Label& L_success) {
8670   Label L_failure;
8671   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
8672   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
8673   bind(L_failure);
8674 }
8675 
8676 
// Fast part of a two-part subtype check, consulting only the supertype
// display of sub_klass (plus the trivial pointer-equality case).  Branches
// to L_success on a definite hit, L_failure on a definite miss, and
// L_slow_path when only the secondary-supers scan done by
// check_klass_subtype_slow_path can decide.  Any one of the three labels
// may be NULL, in which case that outcome falls through.  When
// super_check_offset is not supplied it is loaded from super_klass
// (which requires temp_reg).
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // A NULL label means "fall through"; at most one outcome may do so.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset known only at runtime: distinguish primary-display miss
    // (definite failure) from super-cache miss (needs slow path).
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
8778 
8779 
// Slow part of a two-part subtype check: scans the secondary-supers array
// of sub_klass for super_klass using repne scas on the fixed registers
// rax/rcx/rdi (spilled here if live).  On a hit the result is recorded in
// sub_klass's secondary_super_cache and control goes to L_success; on a
// miss control goes to L_failure.  Either label may be NULL, meaning
// "fall through".  If set_cond_codes, Z/NZ are left indicating
// success/failure for use by the AD files (see comment below).
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Count slow-path entries for diagnostics (non-product builds only).
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
#ifdef _LP64
  // This part is tricky, as values in supers array could be 32 or 64 bit wide
  // and we store values in objArrays always encoded, thus we need to encode
  // the value of rax before repne.  Note that rax is dead after the repne.
  if (UseCompressedOops) {
    encode_heap_oop_not_null(rax); // Changes flags.
    // The superclass is never null; it would be a basic system error if a null
    // pointer were to sneak in here.  Note that we have already loaded the
    // Klass::super_check_offset from the super_klass in the fast path,
    // so if there is a null in that register, we are already in the afterlife.
    testl(rax,rax); // Set Z = 0
    repne_scanl();
  } else
#endif // _LP64
  {
    testptr(rax,rax); // Set Z = 0
    repne_scan();
  }
  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  // NZ after the scan means super_klass was not found: failure.
  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
8885 
8886 
8887 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
8888   if (VM_Version::supports_cmov()) {
8889     cmovl(cc, dst, src);
8890   } else {
8891     Label L;
8892     jccb(negate_condition(cc), L);
8893     movl(dst, src);
8894     bind(L);
8895   }
8896 }
8897 
8898 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
8899   if (VM_Version::supports_cmov()) {
8900     cmovl(cc, dst, src);
8901   } else {
8902     Label L;
8903     jccb(negate_condition(cc), L);
8904     movl(dst, src);
8905     bind(L);
8906   }
8907 }
8908 
// Debug-only oop verification of the value in 'reg'; no-op unless
// -XX:+VerifyOops.  Emits code that pushes the oop plus a message string
// and calls the verify_oop stub indirectly through its entry-point
// variable.  The stub pops the arguments and restores the saved
// registers, so the emitted code has no net register effect.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  // Note: the buffer is never freed here -- its address is baked into the
  // generated code below, so it must outlive this code blob.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop: %s: %s", reg->name(), s);
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
}
8930 
8931 
// Materialize a "delayed value" -- a constant that may not be known yet at
// code-generation time.  If the value behind delayed_value_addr is already
// nonzero it is returned as an immediate (plus 'offset'); otherwise code
// is emitted that loads it indirectly into 'tmp' at runtime and adds
// 'offset', and tmp is returned.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // Debug builds trap at runtime if the delayed value is still zero when
  // the emitted code executes (i.e. it was never initialized).
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      jcc(Assembler::notZero, L);
      // Buffer is never freed; its address is embedded in the stop() code.
      char* buf = new char[40];
      sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
      stop(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}
8963 
8964 
8965 // registers on entry:
8966 //  - rax ('check' register): required MethodType
8967 //  - rcx: method handle
8968 //  - rdx, rsi, or ?: killable temp
// Emit code that compares mtype_reg against the MethodType stored in the
// method handle mh_reg and jumps to wrong_method_type if they differ.
// With compressed oops the stored type must be decoded into temp_reg
// first; otherwise the comparison is done directly against memory.
void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
                                              Register temp_reg,
                                              Label& wrong_method_type) {
  // Address of mh.type, located via the delayed-value mechanism.
  Address type_addr(mh_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg));
  // compare method type against that of the receiver
  if (UseCompressedOops) {
    load_heap_oop(temp_reg, type_addr);
    cmpptr(mtype_reg, temp_reg);
  } else {
    cmpptr(mtype_reg, type_addr);
  }
  jcc(Assembler::notEqual, wrong_method_type);
}
8982 
8983 
8984 // A method handle has a "vmslots" field which gives the size of its
8985 // argument list in JVM stack slots.  This field is either located directly
8986 // in every method handle, or else is indirectly accessed through the
8987 // method handle's MethodType.  This macro hides the distinction.
// Load the vmslots (argument list size in JVM stack slots) of the method
// handle in mh_reg into vmslots_reg, walking mh.type.form.vmslots.
// temp_reg is clobbered by the delayed_value() lookups.
void MacroAssembler::load_method_handle_vmslots(Register vmslots_reg, Register mh_reg,
                                                Register temp_reg) {
  assert_different_registers(vmslots_reg, mh_reg, temp_reg);
  // load mh.type.form.vmslots
  // vmslots_reg doubles as the scratch register for walking the chain of
  // intermediate oops; only the final movl leaves the int result in it.
  Register temp2_reg = vmslots_reg;
  load_heap_oop(temp2_reg, Address(mh_reg,    delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg)));
  load_heap_oop(temp2_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, temp_reg)));
  movl(vmslots_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodTypeForm::vmslots_offset_in_bytes, temp_reg)));
}
8997 
8998 
8999 // registers on entry:
9000 //  - rcx: method handle
9001 //  - rdx: killable temp (interpreted only)
9002 //  - rax: killable temp (compiled only)
// Tail-jump to the from-interpreted entry point of the method handle in
// mh_reg.  Control does not return; temp_reg is clobbered.
void MacroAssembler::jump_to_method_handle_entry(Register mh_reg, Register temp_reg) {
  assert(mh_reg == rcx, "caller must put MH object in rcx");
  assert_different_registers(mh_reg, temp_reg);

  // pick out the interpreted side of the handler
  // NOTE: vmentry is not an oop!
  movptr(temp_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::vmentry_offset_in_bytes, temp_reg)));

  // off we go...
  jmp(Address(temp_reg, MethodHandleEntry::from_interpreted_entry_offset_in_bytes()));

  // for the various stubs which take control at this point,
  // see MethodHandles::generate_method_handle_stub
}
9017 
9018 
9019 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
9020                                          int extra_slot_offset) {
9021   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
9022   int stackElementSize = Interpreter::stackElementSize;
9023   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
9024 #ifdef ASSERT
9025   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
9026   assert(offset1 - offset == stackElementSize, "correct arithmetic");
9027 #endif
9028   Register             scale_reg    = noreg;
9029   Address::ScaleFactor scale_factor = Address::no_scale;
9030   if (arg_slot.is_constant()) {
9031     offset += arg_slot.as_constant() * stackElementSize;
9032   } else {
9033     scale_reg    = arg_slot.as_register();
9034     scale_factor = Address::times(stackElementSize);
9035   }
9036   offset += wordSize;           // return PC is on stack
9037   return Address(rsp, scale_reg, scale_factor, offset);
9038 }
9039 
9040 
// Debug-only oop verification for an oop located at 'addr' rather than in
// a register; no-op unless -XX:+VerifyOops.  Mirrors verify_oop().  Extra
// care is taken when addr is rsp-relative, since the pushes emitted here
// move rsp out from under the caller's displacement.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  // Note: the buffer is never freed here -- its address is baked into the
  // generated code below, so it must outlive this code blob.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop_addr: %s", s);

#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
9076 
// Debug-only consistency check of the current thread's TLAB pointers:
// stops with an assert message unless start <= top <= end.  Emits nothing
// unless both UseTLAB and VerifyOops are enabled (ASSERT builds only).
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    // Preserve scratch registers; 32-bit must also materialize the thread
    // pointer (64-bit keeps it in r15).
    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    // Check top >= start.
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    stop("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    // Check end >= top.
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    stop("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
9107 
// Decoder for an x87 FPU control word image (low 16 bits of _value).
// Exposes the rounding/precision control fields and the six exception
// mask bits, plus a human-readable dump via print().
class ControlWord {
 public:
  int32_t _value;

  // Rounding control is bits 11:10, precision control is bits 9:8,
  // the exception mask bits occupy bits 5..0.
  int  rounding_control() const        { return (_value >> 10) & 3; }
  int  precision_control() const       { return (_value >>  8) & 3; }
  bool precision() const               { return (_value & (1 << 5)) != 0; }
  bool underflow() const               { return (_value & (1 << 4)) != 0; }
  bool overflow() const                { return (_value & (1 << 3)) != 0; }
  bool zero_divide() const             { return (_value & (1 << 2)) != 0; }
  bool denormalized() const            { return (_value & (1 << 1)) != 0; }
  bool invalid() const                 { return (_value & (1 << 0)) != 0; }

  void print() const {
    // Decode the two 2-bit control fields via lookup tables (the & 3 in
    // the accessors guarantees all four indices are valid).
    static const char* rc_names[] = { "round near", "round down", "round up  ", "chop      " };
    static const char* pc_names[] = { "24 bits ", "reserved", "53 bits ", "64 bits " };
    // Mask bits: upper-case letter when the bit is set, lower-case otherwise.
    char masks[9];
    masks[0] = ' ';
    masks[1] = ' ';
    masks[2] = precision   () ? 'P' : 'p';
    masks[3] = underflow   () ? 'U' : 'u';
    masks[4] = overflow    () ? 'O' : 'o';
    masks[5] = zero_divide () ? 'Z' : 'z';
    masks[6] = denormalized() ? 'D' : 'd';
    masks[7] = invalid     () ? 'I' : 'i';
    masks[8] = '\0';
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, masks,
           rc_names[rounding_control()], pc_names[precision_control()]);
  }

};
9154 
// Decoder for an x87 FPU status word image (low 16 bits of _value).
// Exposes the busy flag, condition codes C0..C3, the 3-bit top-of-stack
// field and the exception flag bits, plus a dump via print().
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return (_value & (1 << 15)) != 0; }
  bool C3() const                      { return (_value & (1 << 14)) != 0; }
  bool C2() const                      { return (_value & (1 << 10)) != 0; }
  bool C1() const                      { return (_value & (1 <<  9)) != 0; }
  bool C0() const                      { return (_value & (1 <<  8)) != 0; }
  int  top() const                     { return (_value >> 11) & 7; }
  bool error_status() const            { return (_value & (1 << 7)) != 0; }
  bool stack_fault() const             { return (_value & (1 << 6)) != 0; }
  bool precision() const               { return (_value & (1 << 5)) != 0; }
  bool underflow() const               { return (_value & (1 << 4)) != 0; }
  bool overflow() const                { return (_value & (1 << 3)) != 0; }
  bool zero_divide() const             { return (_value & (1 << 2)) != 0; }
  bool denormalized() const            { return (_value & (1 << 1)) != 0; }
  bool invalid() const                 { return (_value & (1 << 0)) != 0; }

  void print() const {
    // Condition codes: digit when the bit is set, '-' otherwise.
    char cc[5];
    cc[0] = C3() ? '3' : '-';
    cc[1] = C2() ? '2' : '-';
    cc[2] = C1() ? '1' : '-';
    cc[3] = C0() ? '0' : '-';
    cc[4] = '\0';
    // Exception flags for bits 7 down to 0: letter when set, '-' otherwise.
    static const char letters[] = "ESPUOZDI";
    char flags[9];
    for (int bit = 7; bit >= 0; bit--) {
      flags[7 - bit] = (((_value >> bit) & 1) != 0) ? letters[7 - bit] : '-';
    }
    flags[8] = '\0';
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, flags, cc, top());
  }

};
9198 
// Debug helper that decodes an x87 FPU tag word (two tag bits per register).
class TagWord {
 public:
  int32_t _value;

  // Tag of physical register 'reg': 0 = valid, 1 = zero, 2 = special, 3 = empty.
  int tag_at(int reg) const            { return (_value >> (2 * reg)) & 3; }

  // Prints the raw low 16 bits; decoding is done by the callers.
  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};
9210 
// Debug view of one 80-bit x87 data register: 64-bit mantissa split across
// _m1 (high) and _m0 (low), plus a 16-bit sign/exponent word in _ex.
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  // True for the "indefinite" QNaN (sign = 1, exponent all ones,
  // mantissa = 0xC000000000000000).
  bool is_indefinite() const           {
    if (_ex != -1)  return false;
    if (_m0 != 0)   return false;
    return _m1 == (int32_t)0xC0000000;
  }

  // Prints sign, hex exponent and mantissa, and a NaN marker when the
  // exponent field is all ones (positive or negative sign).
  void print() const {
    bool negative     = (_ex < 0);
    bool nan_exponent = (_ex == 0x7FFF) || (_ex == (int16_t)-1);
    printf("%c%04hx.%08x%08x  %s",
           negative ? '-' : '+',
           _ex, _m1, _m0,
           nan_exponent ? "NaN" : "   ");
  }

};
9228 
// Debug snapshot of the x87 FPU: control/status/tag words, fault
// offset/selector pairs, and the eight 80-bit data registers stored
// back-to-back (10 bytes each, in stack order).
class FPU_State {
 public:
  enum {
    register_size       = 10,
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  // Tag for stack-relative register ST(i): the tag word is indexed by
  // physical register number, so bias by the current top-of-stack.
  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  // Registers are stored in stack order, so index directly by stack position.
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  // Dumps all eight registers (the '*' marks the physical register that is
  // the current stack top) followed by the control/status/tag words.
  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;   // stack position of physical register i
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};
9278 
// Debug helper that decodes the IA-32 EFLAGS register.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return (_value & (1 << 11)) != 0; }
  bool direction() const               { return (_value & (1 << 10)) != 0; }
  bool sign() const                    { return (_value & (1 <<  7)) != 0; }
  bool zero() const                    { return (_value & (1 <<  6)) != 0; }
  bool auxiliary_carry() const         { return (_value & (1 <<  4)) != 0; }
  bool parity() const                  { return (_value & (1 <<  2)) != 0; }
  bool carry() const                   { return (_value & (1 <<  0)) != 0; }

  // Prints the raw value plus a one-letter-per-flag summary
  // (upper-case letter when the flag is set).
  void print() const {
    const char flags[8] = {
      overflow       () ? 'O' : '-',
      direction      () ? 'D' : '-',
      sign           () ? 'S' : '-',
      zero           () ? 'Z' : '-',
      auxiliary_carry() ? 'A' : '-',
      parity         () ? 'P' : '-',
      carry          () ? 'C' : '-',
      '\0'
    };
    // output
    printf("%08x  flags = %s", _value, flags);
  }

};
9307 
// Debug view of one integer (general-purpose) register value.
class IU_Register {
 public:
  int32_t _value;

  // Prints the value in hex and, right-aligned, in signed decimal.
  void print() const {
    printf("%08x  %11d", _value, _value);
  }

};
9317 
9318 class IU_State {
9319  public:
9320   Flag_Register _eflags;
9321   IU_Register   _rdi;
9322   IU_Register   _rsi;
9323   IU_Register   _rbp;
9324   IU_Register   _rsp;
9325   IU_Register   _rbx;
9326   IU_Register   _rdx;
9327   IU_Register   _rcx;
9328   IU_Register   _rax;
9329 
9330   void print() const {
9331     // computation registers
9332     printf("rax,  = "); _rax.print(); printf("\n");
9333     printf("rbx,  = "); _rbx.print(); printf("\n");
9334     printf("rcx  = "); _rcx.print(); printf("\n");
9335     printf("rdx  = "); _rdx.print(); printf("\n");
9336     printf("rdi  = "); _rdi.print(); printf("\n");
9337     printf("rsi  = "); _rsi.print(); printf("\n");
9338     printf("rbp,  = "); _rbp.print(); printf("\n");
9339     printf("rsp  = "); _rsp.print(); printf("\n");
9340     printf("\n");
9341     // control registers
9342     printf("flgs = "); _eflags.print(); printf("\n");
9343   }
9344 };
9345 
9346 
// Complete CPU snapshot: FPU state followed by the integer registers.
// A pointer to this layout is handed to _print_CPU_state by generated code
// (see MacroAssembler::print_CPU_state below).
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  // Dumps integer then FPU state between separator lines.
  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};
9361 
9362 
9363 static void _print_CPU_state(CPU_State* state) {
9364   state->print();
9365 };
9366 
9367 
// Debugging aid: emits code that saves the full CPU state, calls the C
// helper _print_CPU_state with a pointer to the saved area, discards the
// argument and restores the state.  Generated code is side-effect free.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
9375 
9376 
9377 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
9378   static int counter = 0;
9379   FPU_State* fs = &state->_fpu_state;
9380   counter++;
9381   // For leaf calls, only verify that the top few elements remain empty.
9382   // We only need 1 empty at the top for C2 code.
9383   if( stack_depth < 0 ) {
9384     if( fs->tag_for_st(7) != 3 ) {
9385       printf("FPR7 not empty\n");
9386       state->print();
9387       assert(false, "error");
9388       return false;
9389     }
9390     return true;                // All other stack states do not matter
9391   }
9392 
9393   assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
9394          "bad FPU control word");
9395 
9396   // compute stack depth
9397   int i = 0;
9398   while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
9399   int d = i;
9400   while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
9401   // verify findings
9402   if (i != FPU_State::number_of_registers) {
9403     // stack not contiguous
9404     printf("%s: stack not contiguous at ST%d\n", s, i);
9405     state->print();
9406     assert(false, "error");
9407     return false;
9408   }
9409   // check if computed stack depth corresponds to expected stack depth
9410   if (stack_depth < 0) {
9411     // expected stack depth is -stack_depth or less
9412     if (d > -stack_depth) {
9413       // too many elements on the stack
9414       printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
9415       state->print();
9416       assert(false, "error");
9417       return false;
9418     }
9419   } else {
9420     // expected stack depth is stack_depth
9421     if (d != stack_depth) {
9422       // wrong stack depth
9423       printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
9424       state->print();
9425       assert(false, "error");
9426       return false;
9427     }
9428   }
9429   // everything is cool
9430   return true;
9431 }
9432 
9433 
// Debugging aid: emits code that saves the CPU state, calls
// _verify_FPU(stack_depth, s, state), and breaks (int3) if it reports an
// inconsistent x87 stack.  No code is emitted unless VerifyFPU is set.
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error: _verify_FPU returns its result in rax
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
9453 
// Load the klass of the object in 'src' into 'dst'.  With compressed oops
// (64-bit only) the klass field holds a narrow oop that must be decoded.
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}
9463 
// Load the prototype header word from src's klass into dst.  The
// compressed-oops path decodes the narrow klass inline, folding the
// heap-base addition and shift into the addressing mode when possible.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert (Universe::heap() != NULL, "java heap should be initialized");
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_oop_shift() != 0) {
      assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      if (LogMinObjAlignmentInBytes == Address::times_8) {
        // decode shift folded into the scaled-index addressing mode
        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset()));
      } else {
        // OK to use shift since we don't need to preserve flags.
        shlq(dst, LogMinObjAlignmentInBytes);
        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset()));
      }
    } else {
      // zero shift: the narrow klass value is the address itself
      movq(dst, Address(dst, Klass::prototype_header_offset()));
    }
  } else
#endif
  {
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    movptr(dst, Address(dst, Klass::prototype_header_offset()));
  }
}
9488 
// Store the klass in 'src' into the object in 'dst'.  With compressed oops
// 'src' is encoded in place first (clobbering it), then stored narrow.
void MacroAssembler::store_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    encode_heap_oop_not_null(src);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
9498 
// Load a (possibly NULL) heap oop from memory into 'dst', decoding it when
// compressed oops are in use.
void MacroAssembler::load_heap_oop(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop(dst);
  } else
#endif
    movptr(dst, src);
}
9508 
// Doesn't do verification, generates fixed size code.
// Like load_heap_oop but the oop is known non-NULL, so the cheaper
// not-null decode (no null check) can be used.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, src);
}
9519 
// Store the (possibly NULL) heap oop in 'src' to memory.  With compressed
// oops 'src' is encoded in place (clobbering it) before the narrow store,
// so 'src' must not participate in the destination address.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else
#endif
    movptr(dst, src);
}
9530 
// Used for storing NULLs.
// Stores a NULL oop at 'dst': a 32-bit narrow NULL under compressed oops,
// otherwise a full word (via sign-extended 32-bit immediate on 64-bit).
void MacroAssembler::store_heap_oop_null(Address dst) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, (int32_t)NULL_WORD);
  } else {
    movslq(dst, (int32_t)NULL_WORD);
  }
#else
  movl(dst, (int32_t)NULL_WORD);
#endif
}
9543 
9544 #ifdef _LP64
// With compressed oops the klass field is only 32 bits, leaving a 32-bit
// gap in the object header; store 'src' (typically zero) into that gap.
// No-op when compressed oops are off (there is no gap).
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedOops) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
9551 
9552 #ifdef ASSERT
// Debug-only check that r12_heapbase still holds the narrow-oop base;
// stops with 'msg' if the register was clobbered.  Only emits code when
// CheckCompressedOops is set.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
    jcc(Assembler::equal, ok);
    stop(msg);
    bind(ok);
    pop(rscratch1);
  }
}
9566 #endif
9567 
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Compress the (possibly NULL) oop in 'r' in place to its narrow form.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // zero-based: encoding is just the (optional) alignment shift
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // heap-based: NULL must encode to 0, so substitute the heap base for a
  // NULL input (cmov on ZF) before subtracting it back out.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
9586 
// Compress the oop in 'r' in place; 'r' must not be NULL (checked only in
// debug builds), which lets us skip the NULL-preserving cmov of
// encode_heap_oop.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
9607 
// Two-register variant: compress the non-NULL oop in 'src' into 'dst',
// leaving 'src' unchanged when the registers differ.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (Universe::narrow_oop_base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
9631 
// Decompress the (possibly NULL) narrow oop in 'r' in place.
void  MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    // shlq sets ZF from its result, so a narrow NULL (0) skips the base
    // addition and remains NULL after decoding.
    // NOTE(review): relies on a non-zero shift here (shl by 0 leaves flags
    // unchanged) — confirm base != NULL implies narrow_oop_shift != 0.
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}
9650 
// Decompress the narrow oop in 'r' in place; the value must not be a
// narrow NULL, so no NULL check is emitted (fixed-size code).
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
9668 
// Two-register variant: decompress the non-NULL narrow oop in 'src' into
// 'dst'.  When the shift matches the times_8 scale, the whole decode is a
// single lea; 'src' is preserved when the registers differ.
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // one instruction: dst = heapbase + (src << 3)
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
9696 
// Materialize the narrow oop for 'obj' into register 'dst', recording an
// oop relocation so the GC can later patch the embedded value.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
9705 
// Store the narrow oop for 'obj' to memory at 'dst', recording an oop
// relocation so the GC can later patch the embedded value.
void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
9714 
// Compare register 'dst' against the narrow oop for 'obj' (relocated so the
// GC can patch the embedded immediate).
void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
9723 
// Compare the memory operand 'dst' against the narrow oop for 'obj'
// (relocated so the GC can patch the embedded immediate).
void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
9732 
// Reload r12_heapbase (the compressed-oops base register) from the VM
// global; emits nothing when compressed oops are off.
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
  }
}
9738 #endif // _LP64
9739 
9740 
// C2 compiled method's prolog code.
// framesize   - frame size in bytes, including the return-address word
//               (must be StackAlignmentInBytes-aligned)
// stack_bang  - emit a stack-overflow bang before building the frame
// fp_mode_24b - switch the x87 FPU to 24-bit precision (32-bit VM only)
void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang) {
    generate_stack_overflow_check(framesize);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  // Verify rsp is word-offset from a StackAlignmentInBytes boundary
  // (the return address accounts for the missing word).
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    stop("Stack is not properly aligned!");
    bind(L);
  }
#endif

}
9812 
9813 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Emits SSE4.2 code that searches str1 (length cnt1, in chars) for the
// constant substring str2 of length int_cnt2 >= 8 chars.  On exit 'result'
// holds the match index, or -1 if not found.
void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                      Register cnt1, Register cnt2,
                                      int int_cnt2,  Register result,
                                      XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= 8, "this code isused only for cnt2 >= 8 chars");

  // Load substring.
  movdqu(vec, Address(str2, 0));
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > 8) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movdqu(vec, Address(str2, 0));
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == 8) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == 8) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(EXIT);

  if (int_cnt2 > 8) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), 0x0d);
    // Reload only string if does not match
    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, 8);

    bind(SCAN_SUBSTR);
    subl(cnt1, 8);
    cmpl(cnt2, -8); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, 8);
    movl(cnt2, 8); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, 8);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // index
  bind(EXIT);

} // string_indexofC8
9965 
9966 // Small strings are loaded through stack if they cross page boundary.
9967 void MacroAssembler::string_indexof(Register str1, Register str2,
9968                                     Register cnt1, Register cnt2,
9969                                     int int_cnt2,  Register result,
9970                                     XMMRegister vec, Register tmp) {
9971   ShortBranchVerifier sbv(this);
9972   assert(UseSSE42Intrinsics, "SSE4.2 is required");
9973   //
9974   // int_cnt2 is length of small (< 8 chars) constant substring
9975   // or (-1) for non constant substring in which case its length
9976   // is in cnt2 register.
9977   //
9978   // Note, inline_string_indexOf() generates checks:
9979   // if (substr.count > string.count) return -1;
9980   // if (substr.count == 0) return 0;
9981   //
9982   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
9983 
9984   // This method uses pcmpestri inxtruction with bound registers
9985   //   inputs:
9986   //     xmm - substring
9987   //     rax - substring length (elements count)
9988   //     mem - scanned string
9989   //     rdx - string length (elements count)
9990   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
9991   //   outputs:
9992   //     rcx - matched index in string
9993   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
9994 
9995   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
9996         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
9997         FOUND_CANDIDATE;
9998 
9999   { //========================================================
10000     // We don't know where these strings are located
10001     // and we can't read beyond them. Load them through stack.
10002     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
10003 
10004     movptr(tmp, rsp); // save old SP
10005 
10006     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
10007       if (int_cnt2 == 1) {  // One char
10008         load_unsigned_short(result, Address(str2, 0));
10009         movdl(vec, result); // move 32 bits
10010       } else if (int_cnt2 == 2) { // Two chars
10011         movdl(vec, Address(str2, 0)); // move 32 bits
10012       } else if (int_cnt2 == 4) { // Four chars
10013         movq(vec, Address(str2, 0));  // move 64 bits
10014       } else { // cnt2 = { 3, 5, 6, 7 }
10015         // Array header size is 12 bytes in 32-bit VM
10016         // + 6 bytes for 3 chars == 18 bytes,
10017         // enough space to load vec and shift.
10018         assert(HeapWordSize*typeArrayKlass::header_size() >= 12,"sanity");
10019         movdqu(vec, Address(str2, (int_cnt2*2)-16));
10020         psrldq(vec, 16-(int_cnt2*2));
10021       }
10022     } else { // not constant substring
10023       cmpl(cnt2, 8);
10024       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
10025 
      // We can read beyond string if str+16 does not cross page boundary
10027       // since heaps are aligned and mapped by pages.
10028       assert(os::vm_page_size() < (int)G, "default page should be small");
10029       movl(result, str2); // We need only low 32 bits
10030       andl(result, (os::vm_page_size()-1));
10031       cmpl(result, (os::vm_page_size()-16));
10032       jccb(Assembler::belowEqual, CHECK_STR);
10033 
10034       // Move small strings to stack to allow load 16 bytes into vec.
10035       subptr(rsp, 16);
10036       int stk_offset = wordSize-2;
10037       push(cnt2);
10038 
10039       bind(COPY_SUBSTR);
10040       load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
10041       movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
10042       decrement(cnt2);
10043       jccb(Assembler::notZero, COPY_SUBSTR);
10044 
10045       pop(cnt2);
10046       movptr(str2, rsp);  // New substring address
10047     } // non constant
10048 
10049     bind(CHECK_STR);
10050     cmpl(cnt1, 8);
10051     jccb(Assembler::aboveEqual, BIG_STRINGS);
10052 
10053     // Check cross page boundary.
10054     movl(result, str1); // We need only low 32 bits
10055     andl(result, (os::vm_page_size()-1));
10056     cmpl(result, (os::vm_page_size()-16));
10057     jccb(Assembler::belowEqual, BIG_STRINGS);
10058 
10059     subptr(rsp, 16);
10060     int stk_offset = -2;
10061     if (int_cnt2 < 0) { // not constant
10062       push(cnt2);
10063       stk_offset += wordSize;
10064     }
10065     movl(cnt2, cnt1);
10066 
10067     bind(COPY_STR);
10068     load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
10069     movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
10070     decrement(cnt2);
10071     jccb(Assembler::notZero, COPY_STR);
10072 
10073     if (int_cnt2 < 0) { // not constant
10074       pop(cnt2);
10075     }
10076     movptr(str1, rsp);  // New string address
10077 
10078     bind(BIG_STRINGS);
10079     // Load substring.
10080     if (int_cnt2 < 0) { // -1
10081       movdqu(vec, Address(str2, 0));
10082       push(cnt2);       // substr count
10083       push(str2);       // substr addr
10084       push(str1);       // string addr
10085     } else {
10086       // Small (< 8 chars) constant substrings are loaded already.
10087       movl(cnt2, int_cnt2);
10088     }
10089     push(tmp);  // original SP
10090 
10091   } // Finished loading
10092 
10093   //========================================================
10094   // Start search
10095   //
10096 
10097   movptr(result, str1); // string addr
10098 
10099   if (int_cnt2  < 0) {  // Only for non constant substring
10100     jmpb(SCAN_TO_SUBSTR);
10101 
10102     // SP saved at sp+0
10103     // String saved at sp+1*wordSize
10104     // Substr saved at sp+2*wordSize
10105     // Substr count saved at sp+3*wordSize
10106 
10107     // Reload substr for rescan, this code
10108     // is executed only for large substrings (> 8 chars)
10109     bind(RELOAD_SUBSTR);
10110     movptr(str2, Address(rsp, 2*wordSize));
10111     movl(cnt2, Address(rsp, 3*wordSize));
10112     movdqu(vec, Address(str2, 0));
10113     // We came here after the beginning of the substring was
10114     // matched but the rest of it was not so we need to search
10115     // again. Start from the next element after the previous match.
10116     subptr(str1, result); // Restore counter
10117     shrl(str1, 1);
10118     addl(cnt1, str1);
10119     decrementl(cnt1);   // Shift to next element
10120     cmpl(cnt1, cnt2);
10121     jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
10122 
10123     addptr(result, 2);
10124   } // non constant
10125 
10126   // Scan string for start of substr in 16-byte vectors
10127   bind(SCAN_TO_SUBSTR);
10128   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
10129   pcmpestri(vec, Address(result, 0), 0x0d);
10130   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
10131   subl(cnt1, 8);
10132   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
10133   cmpl(cnt1, cnt2);
10134   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
10135   addptr(result, 16);
10136 
10137   bind(ADJUST_STR);
10138   cmpl(cnt1, 8); // Do not read beyond string
10139   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
10140   // Back-up string to avoid reading beyond string.
10141   lea(result, Address(result, cnt1, Address::times_2, -16));
10142   movl(cnt1, 8);
10143   jmpb(SCAN_TO_SUBSTR);
10144 
10145   // Found a potential substr
10146   bind(FOUND_CANDIDATE);
10147   // After pcmpestri tmp(rcx) contains matched element index
10148 
10149   // Make sure string is still long enough
10150   subl(cnt1, tmp);
10151   cmpl(cnt1, cnt2);
10152   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.
10154 
10155   bind(RET_NOT_FOUND);
10156   movl(result, -1);
10157   jmpb(CLEANUP);
10158 
10159   bind(FOUND_SUBSTR);
10160   // Compute start addr of substr
10161   lea(result, Address(result, tmp, Address::times_2));
10162 
10163   if (int_cnt2 > 0) { // Constant substring
10164     // Repeat search for small substring (< 8 chars)
10165     // from new point without reloading substring.
10166     // Have to check that we don't read beyond string.
10167     cmpl(tmp, 8-int_cnt2);
10168     jccb(Assembler::greater, ADJUST_STR);
10169     // Fall through if matched whole substring.
10170   } else { // non constant
10171     assert(int_cnt2 == -1, "should be != 0");
10172 
10173     addl(tmp, cnt2);
10174     // Found result if we matched whole substring.
10175     cmpl(tmp, 8);
10176     jccb(Assembler::lessEqual, RET_FOUND);
10177 
10178     // Repeat search for small substring (<= 8 chars)
10179     // from new point 'str1' without reloading substring.
10180     cmpl(cnt2, 8);
10181     // Have to check that we don't read beyond string.
10182     jccb(Assembler::lessEqual, ADJUST_STR);
10183 
10184     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
10185     // Compare the rest of substring (> 8 chars).
10186     movptr(str1, result);
10187 
10188     cmpl(tmp, cnt2);
10189     // First 8 chars are already matched.
10190     jccb(Assembler::equal, CHECK_NEXT);
10191 
10192     bind(SCAN_SUBSTR);
10193     pcmpestri(vec, Address(str1, 0), 0x0d);
10194     // Need to reload strings pointers if not matched whole vector
10195     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
10196 
10197     bind(CHECK_NEXT);
10198     subl(cnt2, 8);
10199     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
10200     addptr(str1, 16);
10201     addptr(str2, 16);
10202     subl(cnt1, 8);
10203     cmpl(cnt2, 8); // Do not read beyond substring
10204     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
10205     // Back-up strings to avoid reading beyond substring.
10206     lea(str2, Address(str2, cnt2, Address::times_2, -16));
10207     lea(str1, Address(str1, cnt2, Address::times_2, -16));
10208     subl(cnt1, cnt2);
10209     movl(cnt2, 8);
10210     addl(cnt1, 8);
10211     bind(CONT_SCAN_SUBSTR);
10212     movdqu(vec, Address(str2, 0));
10213     jmpb(SCAN_SUBSTR);
10214 
10215     bind(RET_FOUND_LONG);
10216     movptr(str1, Address(rsp, wordSize));
10217   } // non constant
10218 
10219   bind(RET_FOUND);
10220   // Compute substr offset
10221   subptr(result, str1);
10222   shrl(result, 1); // index
10223 
10224   bind(CLEANUP);
10225   pop(rsp); // restore SP
10226 
10227 } // string_indexof
10228 
// Compare strings lexicographically, with java.lang.String.compareTo()
// semantics.
//   str1, str2 - addresses of the first char of each string
//   cnt1, cnt2 - string lengths (element counts); cnt1/cnt2 are clobbered
//   result     - receives negative / zero / positive
//   vec1       - XMM scratch used by the SSE4.2 wide-vector loop
// When UseSSE42Intrinsics is set, pcmpestri pins registers:
// result == rax, cnt2 == rdx, cnt1 == rcx (asserted below).
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    XMMRegister vec1) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;

  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);                            // length difference
  push(cnt1);                                  // saved: popped as the result when
                                               // all min(cnt1,cnt2) chars match
  cmov32(Assembler::lessEqual, cnt2, result);  // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  // Load first characters
  load_unsigned_short(result, Address(str1, 0));
  load_unsigned_short(cnt1, Address(str2, 0));

  // Compare first characters
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);
  decrementl(cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  {
    // Check after comparing first character to see if strings are equivalent
    Label LSkip2;
    // Check if the strings start at same location
    cmpptr(str1, str2);
    jccb(Assembler::notEqual, LSkip2);

    // Check if the length difference is zero (from stack)
    cmpl(Address(rsp, 0), 0x0);
    jcc(Assembler::equal,  LENGTH_DIFF_LABEL);

    // Strings might not be equivalent
    bind(LSkip2);
  }

  Address::ScaleFactor scale = Address::times_2; // chars are 2 bytes wide
  int stride = 8;                                // 8 chars per 16-byte vector

  // Advance to next element
  addptr(str1, 16/stride);
  addptr(str2, 16/stride);

  if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;   // imm8: negated "equal each" compare, unsigned shorts
    // Setup to compare 16-byte vectors
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point str1/str2 at the end of the vectorizable region and use a
    // negative index (result) counting up toward zero.
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - string fragment from str1
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testl(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-check the last full vector ending exactly at the string end; this
    // overlaps already-compared chars but avoids a scalar tail loop.
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(result, cnt1);            // absolute index of the mismatched char
    movptr(cnt2, result);
    load_unsigned_short(result, Address(str1, cnt2, scale));
    load_unsigned_short(cnt1, Address(str2, cnt2, scale));
    subl(result, cnt1);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }

  // Shift str2 and str1 to the end of the arrays, negate min
  lea(str1, Address(str1, cnt2, scale, 0));
  lea(str2, Address(str2, cnt2, scale, 0));
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  jmpb(DONE_LABEL);

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
}
10365 
// Compare char[] arrays aligned to 4 bytes or substrings.
//   is_array_equ - true: full Arrays.equals semantics (null checks, length
//                  check, skip over the array headers); false: ary1/ary2 are
//                  raw data addresses and 'limit' is the element count
//   ary1, ary2   - array oops (is_array_equ) or data addresses; clobbered
//   limit        - element count input when !is_array_equ; scratch otherwise
//   result       - receives 1 (equal) or 0 (not equal)
//   chr          - general-purpose scratch
//   vec1, vec2   - XMM scratch used by the SSE4.2 16-byte compare loop
void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                        Register limit, Register result, Register chr,
                                        XMMRegister vec1, XMMRegister vec2) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  // Check the input args
  cmpptr(ary1, ary2);                  // same array (or address) => equal
  jcc(Assembler::equal, TRUE_LABEL);

  if (is_array_equ) {
    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL); // null != non-null (equal nulls handled above)
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  shll(limit, 1);      // byte count != 0 (chars are 2 bytes)
  movl(result, limit); // copy

  if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000e);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    // Address arrays from the end using a negative index counting to zero.
    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);          // zero iff the 16-byte chunks are identical

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);

    // Compare the tail with one overlapping 16-byte load ending at the
    // array end (re-compares some bytes, avoids a scalar loop).
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, TRUE_LABEL);
  load_unsigned_short(chr, Address(ary1, 0));    // ary1/ary2 now point at the tail
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
}
10479 
// BLOCK_COMMENT emits a textual annotation into the generated-code stream
// for disassembly listings; it compiles to nothing in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif

// BIND binds an assembler label and (in debug builds) annotates it by name.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Fill a primitive array region with a replicated element value.
//   t       - element type (T_BYTE, T_SHORT or T_INT only)
//   aligned - caller guarantees 'to' is word aligned (skips alignment fixups)
//   to      - destination address; clobbered
//   value   - fill value (low 8/16/32 bits significant per type); clobbered
//   count   - number of elements (not bytes) to fill; clobbered
//   rtmp    - general-purpose scratch
//   xtmp    - XMM scratch (used on the UseSSE >= 2 path)
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  Label L_fill_2_bytes, L_fill_4_bytes;

  // shift converts byte quantities to element counts: (1 << shift) elements
  // occupy 4 bytes, so "n bytes" == (n/4 << shift) elements below.
  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate the element value across all 32 bits of 'value'.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);       // byte replicated into the low 16 bits
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);       // 16-bit pattern replicated into all 32 bits
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1)); // one element for shorts, one for... (2 bytes worth)
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    // No XMM available: fill 32 bytes per iteration with plain 32-bit stores.
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);           // 8 << shift elements == 32 bytes
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);           // undo the last subtraction
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));     // 8 bytes worth of elements
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);           // 4 bytes worth of elements
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Fill 32-byte chunks
      movdl(xtmp, value);
      pshufd(xtmp, xtmp, 0);           // broadcast 32-bit pattern to all 4 lanes

      subl(count, 8 << shift);
      jcc(Assembler::less, L_check_fill_8_bytes);
      align(16);

      BIND(L_fill_32_bytes_loop);

      if (UseUnalignedLoadStores) {
        movdqu(Address(to, 0), xtmp);
        movdqu(Address(to, 16), xtmp);
      } else {
        movq(Address(to, 0), xtmp);
        movq(Address(to, 8), xtmp);
        movq(Address(to, 16), xtmp);
        movq(Address(to, 24), xtmp);
      }

      addptr(to, 32);
      subl(count, 8 << shift);
      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
      BIND(L_check_fill_8_bytes);
      addl(count, 8 << shift);         // undo the last subtraction
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
10657 #undef BIND
10658 #undef BLOCK_COMMENT
10659 
10660 
10661 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10662   switch (cond) {
10663     // Note some conditions are synonyms for others
10664     case Assembler::zero:         return Assembler::notZero;
10665     case Assembler::notZero:      return Assembler::zero;
10666     case Assembler::less:         return Assembler::greaterEqual;
10667     case Assembler::lessEqual:    return Assembler::greater;
10668     case Assembler::greater:      return Assembler::lessEqual;
10669     case Assembler::greaterEqual: return Assembler::less;
10670     case Assembler::below:        return Assembler::aboveEqual;
10671     case Assembler::belowEqual:   return Assembler::above;
10672     case Assembler::above:        return Assembler::belowEqual;
10673     case Assembler::aboveEqual:   return Assembler::below;
10674     case Assembler::overflow:     return Assembler::noOverflow;
10675     case Assembler::noOverflow:   return Assembler::overflow;
10676     case Assembler::negative:     return Assembler::positive;
10677     case Assembler::positive:     return Assembler::negative;
10678     case Assembler::parity:       return Assembler::noParity;
10679     case Assembler::noParity:     return Assembler::parity;
10680   }
10681   ShouldNotReachHere(); return Assembler::overflow;
10682 }
10683 
10684 SkipIfEqual::SkipIfEqual(
10685     MacroAssembler* masm, const bool* flag_addr, bool value) {
10686   _masm = masm;
10687   _masm->cmp8(ExternalAddress((address)flag_addr), value);
10688   _masm->jcc(Assembler::equal, _label);
10689 }
10690 
// Bind the skip target: execution resumes here when the flag matched and the
// guarded code emitted since construction was branched over.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}