1 /*
   2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "assembler_x86.inline.hpp"
  27 #include "gc_interface/collectedHeap.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "memory/cardTableModRefBS.hpp"
  30 #include "memory/resourceArea.hpp"
  31 #include "prims/methodHandles.hpp"
  32 #include "runtime/biasedLocking.hpp"
  33 #include "runtime/interfaceSupport.hpp"
  34 #include "runtime/objectMonitor.hpp"
  35 #include "runtime/os.hpp"
  36 #include "runtime/sharedRuntime.hpp"
  37 #include "runtime/stubRoutines.hpp"
  38 #ifndef SERIALGC
  39 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  40 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  41 #include "gc_implementation/g1/heapRegion.hpp"
  42 #endif
  43 
// Implementation of AddressLiteral

// Construct an AddressLiteral for |target|, recording the relocation
// spec implied by |rtype| so the literal can be patched later.  The
// literal is a plain address value, not an lval.
AddressLiteral::AddressLiteral(address target, relocInfo::relocType rtype) {
  _is_lval = false;
  _target = target;
  switch (rtype) {
  case relocInfo::oop_type:
    // Oops are a special case. Normally they would be their own section
    // but in cases like icBuffer they are literals in the code stream that
    // we don't have a section for. We use none so that we get a literal address
    // which is always patchable.
    break;
  case relocInfo::external_word_type:
    _rspec = external_word_Relocation::spec(target);
    break;
  case relocInfo::internal_word_type:
    _rspec = internal_word_Relocation::spec(target);
    break;
  case relocInfo::opt_virtual_call_type:
    _rspec = opt_virtual_call_Relocation::spec();
    break;
  case relocInfo::static_call_type:
    _rspec = static_call_Relocation::spec();
    break;
  case relocInfo::runtime_call_type:
    _rspec = runtime_call_Relocation::spec();
    break;
  case relocInfo::poll_type:
  case relocInfo::poll_return_type:
    // Polls carry no target payload; a simple spec suffices.
    _rspec = Relocation::spec_simple(rtype);
    break;
  case relocInfo::none:
    break;
  default:
    ShouldNotReachHere();
    break;
  }
}
  82 
// Implementation of Address

#ifdef _LP64

// 64-bit: array addressing via an absolute base literal cannot be
// encoded; callers must have lowered it before reaching here.
Address Address::make_array(ArrayAddress adr) {
  // Not implementable on 64bit machines
  // Should have been handled higher up the call chain.
  ShouldNotReachHere();
  return Address();
}

// exceedingly dangerous constructor
// Builds a bare [disp32] address and attaches the relocation implied by
// |rtype| for the out-of-line location |loc|.  No base/index registers.
Address::Address(int disp, address loc, relocInfo::relocType rtype) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = disp;
  switch (rtype) {
    case relocInfo::external_word_type:
      _rspec = external_word_Relocation::spec(loc);
      break;
    case relocInfo::internal_word_type:
      _rspec = internal_word_Relocation::spec(loc);
      break;
    case relocInfo::runtime_call_type:
      // HMM
      _rspec = runtime_call_Relocation::spec();
      break;
    case relocInfo::poll_type:
    case relocInfo::poll_return_type:
      _rspec = Relocation::spec_simple(rtype);
      break;
    case relocInfo::none:
      break;
    default:
      ShouldNotReachHere();
  }
}
#else // _LP64

// 32-bit: fold the absolute base address of the array into the
// displacement of an [index*scale + disp32] operand, carrying the
// base literal's relocation along.
Address Address::make_array(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(index._base, index._index, index._scale, (intptr_t) base.target());
  array._rspec = base._rspec;
  return array;
}

// exceedingly dangerous constructor
// Stores the raw code/data address |loc| directly as the displacement,
// with the caller-supplied relocation spec.
Address::Address(address loc, RelocationHolder spec) {
  _base  = noreg;
  _index = noreg;
  _scale = no_scale;
  _disp  = (intptr_t) loc;
  _rspec = spec;
}

#endif // _LP64
 142 
 143 
 144 
// Convert the raw encoding form into the form expected by the constructor for
// Address.  An index of 4 (rsp) corresponds to having no index, so convert
// that to noreg for the Address constructor.
// If |disp_is_oop| the displacement is an embedded oop, so attach an
// oop_type relocation to the resulting address.
Address Address::make_raw(int base, int index, int scale, int disp, bool disp_is_oop) {
  RelocationHolder rspec;
  if (disp_is_oop) {
    rspec = Relocation::spec_simple(relocInfo::oop_type);
  }
  // Encoding 4 (rsp) in the index field of SIB means "no index".
  bool valid_index = index != rsp->encoding();
  if (valid_index) {
    Address madr(as_Register(base), as_Register(index), (Address::ScaleFactor)scale, in_ByteSize(disp));
    madr._rspec = rspec;
    return madr;
  } else {
    Address madr(as_Register(base), noreg, Address::no_scale, in_ByteSize(disp));
    madr._rspec = rspec;
    return madr;
  }
}
 164 
 165 // Implementation of Assembler
 166 
 167 int AbstractAssembler::code_fill_byte() {
 168   return (u_char)'\xF4'; // hlt
 169 }
 170 
 171 // make this go away someday
 172 void Assembler::emit_data(jint data, relocInfo::relocType rtype, int format) {
 173   if (rtype == relocInfo::none)
 174         emit_long(data);
 175   else  emit_data(data, Relocation::spec_simple(rtype), format);
 176 }
 177 
// Emit a 32-bit data word with an attached relocation.  The relocation
// is recorded against the enclosing instruction's start (inst_mark()),
// not the data word itself, so locate_operand() can find the operand.
void Assembler::emit_data(jint data, RelocationHolder const& rspec, int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  if (rspec.type() !=  relocInfo::none) {
    #ifdef ASSERT
      check_relocation(rspec, format);
    #endif
    // Do not use AbstractAssembler::relocate, which is not intended for
    // embedded words.  Instead, relocate to the enclosing instruction.

    // hack. call32 is too wide for mask so use disp32
    if (format == call32_operand)
      code_section()->relocate(inst_mark(), rspec, disp32_operand);
    else
      code_section()->relocate(inst_mark(), rspec, format);
  }
  emit_long(data);
}
 196 
 197 static int encode(Register r) {
 198   int enc = r->encoding();
 199   if (enc >= 8) {
 200     enc -= 8;
 201   }
 202   return enc;
 203 }
 204 
 205 static int encode(XMMRegister r) {
 206   int enc = r->encoding();
 207   if (enc >= 8) {
 208     enc -= 8;
 209   }
 210   return enc;
 211 }
 212 
// Emit a byte-sized arithmetic op with an imm8: opcode |op1|, ModRM
// built from |op2| plus the register encoding, then the immediate.
void Assembler::emit_arith_b(int op1, int op2, Register dst, int imm8) {
  assert(dst->has_byte_register(), "must have byte register");
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert(isByte(imm8), "not a byte");
  assert((op1 & 0x01) == 0, "should be 8bit operation"); // bit 0 clear = byte form
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_byte(imm8);
}
 222 
 223 
 224 void Assembler::emit_arith(int op1, int op2, Register dst, int32_t imm32) {
 225   assert(isByte(op1) && isByte(op2), "wrong opcode");
 226   assert((op1 & 0x01) == 1, "should be 32bit operation");
 227   assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
 228   if (is8bit(imm32)) {
 229     emit_byte(op1 | 0x02); // set sign bit
 230     emit_byte(op2 | encode(dst));
 231     emit_byte(imm32 & 0xFF);
 232   } else {
 233     emit_byte(op1);
 234     emit_byte(op2 | encode(dst));
 235     emit_long(imm32);
 236   }
 237 }
 238 
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (used when the encoding length must be stable, e.g. for patching).
void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) {
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_long(imm32);
}
 248 
// immediate-to-memory forms
// |rm| carries the opcode-extension register (the /digit of the 0x81
// group); the immediate-size argument to emit_operand accounts for the
// trailing imm8/imm32 when computing RIP-relative displacements.
void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  if (is8bit(imm32)) {
    emit_byte(op1 | 0x02); // set sign bit
    emit_operand(rm, adr, 1);
    emit_byte(imm32 & 0xFF);
  } else {
    emit_byte(op1);
    emit_operand(rm, adr, 4);
    emit_long(imm32);
  }
}
 263 
// 32-bit only: arithmetic op whose immediate is an embedded oop
// (emitted with an oop_type relocation so the GC can patch it).
void Assembler::emit_arith(int op1, int op2, Register dst, jobject obj) {
  LP64_ONLY(ShouldNotReachHere());
  assert(isByte(op1) && isByte(op2), "wrong opcode");
  assert((op1 & 0x01) == 1, "should be 32bit operation");
  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
  InstructionMark im(this); // mark start so the embedded word can relocate
  emit_byte(op1);
  emit_byte(op2 | encode(dst));
  emit_data((intptr_t)obj, relocInfo::oop_type, 0);
}
 274 
 275 
 276 void Assembler::emit_arith(int op1, int op2, Register dst, Register src) {
 277   assert(isByte(op1) && isByte(op2), "wrong opcode");
 278   emit_byte(op1);
 279   emit_byte(op2 | encode(dst) << 3 | encode(src));
 280 }
 281 
 282 
// Emit the ModRM (+ optional SIB + displacement) bytes for the memory
// operand [base + index*scale + disp], with |reg| in the ModRM reg
// field.  A non-none relocation on |rspec| forces the disp32 form so
// the displacement stays patchable.  |rip_relative_correction| is the
// number of immediate bytes that follow the displacement, needed when
// computing a 64-bit RIP-relative offset.
void Assembler::emit_operand(Register reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec,
                             int rip_relative_correction) {
  relocInfo::relocType rtype = (relocInfo::relocType) rspec.type();

  // Encode the registers as needed in the fields they are used in

  int regenc = encode(reg) << 3;
  int indexenc = index->is_valid() ? encode(index) << 3 : 0;
  int baseenc = base->is_valid() ? encode(base) : 0;

  if (base->is_valid()) {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [base + index*scale + disp]
      // rbp/r13 as base cannot use the no-disp form (mod=00, base=101
      // means disp32-only), hence the extra exclusion below.
      if (disp == 0 && rtype == relocInfo::none  &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base + index*scale]
        // [00 reg 100][ss index base]
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x04 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + index*scale + imm8]
        // [01 reg 100][ss index base] imm8
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x44 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + index*scale + disp32]
        // [10 reg 100][ss index base] disp32
        assert(index != rsp, "illegal addressing mode");
        emit_byte(0x84 | regenc);
        emit_byte(scale << 6 | indexenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    } else if (base == rsp LP64_ONLY(|| base == r12)) {
      // [rsp + disp]
      // rsp/r12 as base requires a SIB byte (base encoding 100 is the
      // SIB escape in ModRM).
      if (disp == 0 && rtype == relocInfo::none) {
        // [rsp]
        // [00 reg 100][00 100 100]
        emit_byte(0x04 | regenc);
        emit_byte(0x24);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [rsp + imm8]
        // [01 reg 100][00 100 100] disp8
        emit_byte(0x44 | regenc);
        emit_byte(0x24);
        emit_byte(disp & 0xFF);
      } else {
        // [rsp + imm32]
        // [10 reg 100][00 100 100] disp32
        emit_byte(0x84 | regenc);
        emit_byte(0x24);
        emit_data(disp, rspec, disp32_operand);
      }
    } else {
      // [base + disp]
      assert(base != rsp LP64_ONLY(&& base != r12), "illegal addressing mode");
      if (disp == 0 && rtype == relocInfo::none &&
          base != rbp LP64_ONLY(&& base != r13)) {
        // [base]
        // [00 reg base]
        emit_byte(0x00 | regenc | baseenc);
      } else if (is8bit(disp) && rtype == relocInfo::none) {
        // [base + disp8]
        // [01 reg base] disp8
        emit_byte(0x40 | regenc | baseenc);
        emit_byte(disp & 0xFF);
      } else {
        // [base + disp32]
        // [10 reg base] disp32
        emit_byte(0x80 | regenc | baseenc);
        emit_data(disp, rspec, disp32_operand);
      }
    }
  } else {
    if (index->is_valid()) {
      assert(scale != Address::no_scale, "inconsistent address");
      // [index*scale + disp]
      // [00 reg 100][ss index 101] disp32
      assert(index != rsp, "illegal addressing mode");
      emit_byte(0x04 | regenc);
      emit_byte(scale << 6 | indexenc | 0x05);
      emit_data(disp, rspec, disp32_operand);
    } else if (rtype != relocInfo::none ) {
      // [disp] (64bit) RIP-RELATIVE (32bit) abs
      // [00 000 101] disp32

      emit_byte(0x05 | regenc);
      // Note that the RIP-rel. correction applies to the generated
      // disp field, but _not_ to the target address in the rspec.

      // disp was created by converting the target address minus the pc
      // at the start of the instruction. That needs more correction here.
      // intptr_t disp = target - next_ip;
      assert(inst_mark() != NULL, "must be inside InstructionMark");
      address next_ip = pc() + sizeof(int32_t) + rip_relative_correction;
      int64_t adjusted = disp;
      // Do rip-rel adjustment for 64bit
      LP64_ONLY(adjusted -=  (next_ip - inst_mark()));
      assert(is_simm32(adjusted),
             "must be 32bit offset (RIP relative address)");
      emit_data((int32_t) adjusted, rspec, disp32_operand);

    } else {
      // 32bit never did this, did everything as the rip-rel/disp code above
      // [disp] ABSOLUTE
      // [00 reg 100][00 100 101] disp32
      emit_byte(0x04 | regenc);
      emit_byte(0x25);
      emit_data(disp, rspec, disp32_operand);
    }
  }
}
 400 
// XMM variant: register fields encode identically to GP registers, so
// forward to the Register form via a cast.
void Assembler::emit_operand(XMMRegister reg, Register base, Register index,
                             Address::ScaleFactor scale, int disp,
                             RelocationHolder const& rspec) {
  emit_operand((Register)reg, base, index, scale, disp, rspec);
}
 406 
// Secret local extension to Assembler::WhichOperand:
#define end_pc_operand (_WhichOperand_limit)

address Assembler::locate_operand(address inst, WhichOperand which) {
  // Decode the given instruction, and return the address of
  // an embedded 32-bit operand word.

  // If "which" is disp32_operand, selects the displacement portion
  // of an effective address specifier.
  // If "which" is imm64_operand, selects the trailing immediate constant.
  // If "which" is call32_operand, selects the displacement of a call or jump.
  // Caller is responsible for ensuring that there is such an operand,
  // and that it is 32/64 bits wide.

  // If "which" is end_pc_operand, find the end of the instruction.

  address ip = inst;
  bool is_64bit = false;

  debug_only(bool has_disp32 = false);
  int tail_size = 0; // other random bytes (#32, #16, etc.) at end of insn

  // Phase 1: skip prefixes and classify the opcode.  Cases that have no
  // memory operand return directly; the rest fall out of the switch and
  // let the shared ModRM/SIB parser below locate the displacement.
  again_after_prefix:
  switch (0xFF & *ip++) {

  // These convenience macros generate groups of "case" labels for the switch.
#define REP4(x) (x)+0: case (x)+1: case (x)+2: case (x)+3
#define REP8(x) (x)+0: case (x)+1: case (x)+2: case (x)+3: \
             case (x)+4: case (x)+5: case (x)+6: case (x)+7
#define REP16(x) REP8((x)+0): \
              case REP8((x)+8)

  case CS_segment:
  case SS_segment:
  case DS_segment:
  case ES_segment:
  case FS_segment:
  case GS_segment:
    // Seems dubious
    LP64_ONLY(assert(false, "shouldn't have that prefix"));
    assert(ip == inst+1, "only one prefix allowed");
    goto again_after_prefix;

  case 0x67:
  case REX:
  case REX_B:
  case REX_X:
  case REX_XB:
  case REX_R:
  case REX_RB:
  case REX_RX:
  case REX_RXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    goto again_after_prefix;

  case REX_W:
  case REX_WB:
  case REX_WX:
  case REX_WXB:
  case REX_WR:
  case REX_WRB:
  case REX_WRX:
  case REX_WRXB:
    NOT_LP64(assert(false, "64bit prefixes"));
    is_64bit = true; // REX.W widens any trailing immediate to 64 bits
    goto again_after_prefix;

  case 0xFF: // pushq a; decl a; incl a; call a; jmp a
  case 0x88: // movb a, r
  case 0x89: // movl a, r
  case 0x8A: // movb r, a
  case 0x8B: // movl r, a
  case 0x8F: // popl a
    debug_only(has_disp32 = true);
    break;

  case 0x68: // pushq #32
    if (which == end_pc_operand) {
      return ip + 4;
    }
    assert(which == imm_operand && !is_64bit, "pushl has no disp32 or 64bit immediate");
    return ip;                  // not produced by emit_operand

  case 0x66: // movw ... (size prefix)
    again_after_size_prefix2:
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "64bit prefix found"));
      goto again_after_size_prefix2;
    case 0x8B: // movw r, a
    case 0x89: // movw a, r
      debug_only(has_disp32 = true);
      break;
    case 0xC7: // movw a, #16
      debug_only(has_disp32 = true);
      tail_size = 2;  // the imm16
      break;
    case 0x0F: // several SSE/SSE2 variants
      ip--;    // reparse the 0x0F
      goto again_after_prefix;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP8(0xB8): // movl/q r, #32/#64(oop?)
    if (which == end_pc_operand)  return ip + (is_64bit ? 8 : 4);
    // these asserts are somewhat nonsensical
#ifndef _LP64
    assert(which == imm_operand || which == disp32_operand,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#else
    assert((which == call32_operand || which == imm_operand) && is_64bit ||
           which == narrow_oop_operand && !is_64bit,
           err_msg("which %d is_64_bit %d ip " INTPTR_FORMAT, which, is_64bit, ip));
#endif // _LP64
    return ip;

  case 0x69: // imul r, a, #32
  case 0xC7: // movl a, #32(oop?)
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x0F: // movx..., etc.
    switch (0xFF & *ip++) {
    case 0x3A: // pcmpestri
      tail_size = 1;
      // fall through: 0x3A and 0x38 are both three-byte opcode escapes
    case 0x38: // ptest, pmovzxbw
      ip++; // skip opcode
      debug_only(has_disp32 = true); // has both kinds of operands!
      break;

    case 0x70: // pshufd r, r/a, #8
      debug_only(has_disp32 = true); // has both kinds of operands!
      // fall through: both forms end with an imm8
    case 0x73: // psrldq r, #8
      tail_size = 1;
      break;

    case 0x12: // movlps
    case 0x28: // movaps
    case 0x2E: // ucomiss
    case 0x2F: // comiss
    case 0x54: // andps
    case 0x55: // andnps
    case 0x56: // orps
    case 0x57: // xorps
    case 0x6E: // movd
    case 0x7E: // movd
    case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
      debug_only(has_disp32 = true);
      break;

    case 0xAD: // shrd r, a, %cl
    case 0xAF: // imul r, a
    case 0xBE: // movsbl r, a (movsxb)
    case 0xBF: // movswl r, a (movsxw)
    case 0xB6: // movzbl r, a (movzxb)
    case 0xB7: // movzwl r, a (movzxw)
    case REP16(0x40): // cmovl cc, r, a
    case 0xB0: // cmpxchgb
    case 0xB1: // cmpxchg
    case 0xC1: // xaddl
    case 0xC7: // cmpxchg8
    case REP16(0x90): // setcc a
      debug_only(has_disp32 = true);
      // fall out of the switch to decode the address
      break;

    case 0xC4: // pinsrw r, a, #8
      debug_only(has_disp32 = true);
      // fall through: pinsrw/pextrw both carry an imm8
    case 0xC5: // pextrw r, r, #8
      tail_size = 1;  // the imm8
      break;

    case 0xAC: // shrd r, a, #8
      debug_only(has_disp32 = true);
      tail_size = 1;  // the imm8
      break;

    case REP16(0x80): // jcc rdisp32
      if (which == end_pc_operand)  return ip + 4;
      assert(which == call32_operand, "jcc has no disp32 or imm");
      return ip;
    default:
      ShouldNotReachHere();
    }
    break;

  case 0x81: // addl a, #32; addl r, #32
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    // on 32bit in the case of cmpl, the imm might be an oop
    tail_size = 4;
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0x83: // addl a, #8; addl r, #8
    // also: orl, adcl, sbbl, andl, subl, xorl, cmpl
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1;
    break;

  case 0x9B:
    switch (0xFF & *ip++) {
    case 0xD9: // fnstcw a
      debug_only(has_disp32 = true);
      break;
    default:
      ShouldNotReachHere();
    }
    break;

  case REP4(0x00): // addb a, r; addl a, r; addb r, a; addl r, a
  case REP4(0x10): // adc...
  case REP4(0x20): // and...
  case REP4(0x30): // xor...
  case REP4(0x08): // or...
  case REP4(0x18): // sbb...
  case REP4(0x28): // sub...
  case 0xF7: // mull a
  case 0x8D: // lea r, a
  case 0x87: // xchg r, a
  case REP4(0x38): // cmp...
  case 0x85: // test r, a
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xC1: // sal a, #8; sar a, #8; shl a, #8; shr a, #8
  case 0xC6: // movb a, #8
  case 0x80: // cmpb a, #8
  case 0x6B: // imul r, a, #8
    debug_only(has_disp32 = true); // has both kinds of operands!
    tail_size = 1; // the imm8
    break;

  case 0xC4: // VEX_3bytes
  case 0xC5: // VEX_2bytes
    assert((UseAVX > 0), "shouldn't have VEX prefix");
    assert(ip == inst+1, "no prefixes allowed");
    // C4 and C5 are also used as opcodes for PINSRW and PEXTRW instructions
    // but they have prefix 0x0F and processed when 0x0F processed above.
    //
    // In 32-bit mode the VEX first byte C4 and C5 alias onto LDS and LES
    // instructions (these instructions are not supported in 64-bit mode).
    // To distinguish them bits [7:6] are set in the VEX second byte since
    // ModRM byte can not be of the form 11xxxxxx in 32-bit mode. To set
    // those VEX bits REX and vvvv bits are inverted.
    //
    // Fortunately C2 doesn't generate these instructions so we don't need
    // to check for them in product version.

    // Check second byte
    NOT_LP64(assert((0xC0 & *ip) == 0xC0, "shouldn't have LDS and LES instructions"));

    // First byte
    if ((0xFF & *inst) == VEX_3bytes) {
      ip++; // third byte
      is_64bit = ((VEX_W & *ip) == VEX_W);
    }
    ip++; // opcode
    // To find the end of instruction (which == end_pc_operand).
    switch (0xFF & *ip) {
    case 0x61: // pcmpestri r, r/a, #8
    case 0x70: // pshufd r, r/a, #8
    case 0x73: // psrldq r, #8
      tail_size = 1;  // the imm8
      break;
    default:
      break;
    }
    ip++; // skip opcode
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  case 0xD1: // sal a, 1; sar a, 1; shl a, 1; shr a, 1
  case 0xD3: // sal a, %cl; sar a, %cl; shl a, %cl; shr a, %cl
  case 0xD9: // fld_s a; fst_s a; fstp_s a; fldcw a
  case 0xDD: // fld_d a; fst_d a; fstp_d a
  case 0xDB: // fild_s a; fistp_s a; fld_x a; fstp_x a
  case 0xDF: // fild_d a; fistp_d a
  case 0xD8: // fadd_s a; fsubr_s a; fmul_s a; fdivr_s a; fcomp_s a
  case 0xDC: // fadd_d a; fsubr_d a; fmul_d a; fdivr_d a; fcomp_d a
  case 0xDE: // faddp_d a; fsubrp_d a; fmulp_d a; fdivrp_d a; fcompp_d a
    debug_only(has_disp32 = true);
    break;

  case 0xE8: // call rdisp32
  case 0xE9: // jmp  rdisp32
    if (which == end_pc_operand)  return ip + 4;
    assert(which == call32_operand, "call has no disp32 or imm");
    return ip;

  case 0xF0:                    // Lock
    assert(os::is_MP(), "only on MP");
    goto again_after_prefix;

  case 0xF3:                    // For SSE
  case 0xF2:                    // For SSE2
    switch (0xFF & *ip++) {
    case REX:
    case REX_B:
    case REX_X:
    case REX_XB:
    case REX_R:
    case REX_RB:
    case REX_RX:
    case REX_RXB:
    case REX_W:
    case REX_WB:
    case REX_WX:
    case REX_WXB:
    case REX_WR:
    case REX_WRB:
    case REX_WRX:
    case REX_WRXB:
      NOT_LP64(assert(false, "found 64bit prefix"));
      ip++;
      // NOTE(review): no break here — a REX prefix advances ip twice
      // (fall-through into default); looks intentional but confirm
      // against the 0F-escape encodings this path expects.
    default:
      ip++;
    }
    debug_only(has_disp32 = true); // has both kinds of operands!
    break;

  default:
    ShouldNotReachHere();

#undef REP8
#undef REP16
  }

  assert(which != call32_operand, "instruction is not a call, jmp, or jcc");
#ifdef _LP64
  assert(which != imm_operand, "instruction is not a movq reg, imm64");
#else
  // assert(which != imm_operand || has_imm32, "instruction has no imm32 field");
  assert(which != imm_operand || has_disp32, "instruction has no imm32 field");
#endif // LP64
  assert(which != disp32_operand || has_disp32, "instruction has no disp32 field");

  // Phase 2: parse the output of emit_operand (ModRM, optional SIB,
  // optional disp8/disp32), mirroring the encoder above.
  int op2 = 0xFF & *ip++;
  int base = op2 & 0x07;
  int op3 = -1;
  const int b100 = 4;
  const int b101 = 5;
  if (base == b100 && (op2 >> 6) != 3) {
    op3 = 0xFF & *ip++;
    base = op3 & 0x07;   // refetch the base
  }
  // now ip points at the disp (if any)

  switch (op2 >> 6) {
  case 0:
    // [00 reg  100][ss index base]
    // [00 reg  100][00   100  esp]
    // [00 reg base]
    // [00 reg  100][ss index  101][disp32]
    // [00 reg  101]               [disp32]

    if (base == b101) {
      if (which == disp32_operand)
        return ip;              // caller wants the disp32
      ip += 4;                  // skip the disp32
    }
    break;

  case 1:
    // [01 reg  100][ss index base][disp8]
    // [01 reg  100][00   100  esp][disp8]
    // [01 reg base]               [disp8]
    ip += 1;                    // skip the disp8
    break;

  case 2:
    // [10 reg  100][ss index base][disp32]
    // [10 reg  100][00   100  esp][disp32]
    // [10 reg base]               [disp32]
    if (which == disp32_operand)
      return ip;                // caller wants the disp32
    ip += 4;                    // skip the disp32
    break;

  case 3:
    // [11 reg base]  (not a memory addressing mode)
    break;
  }

  if (which == end_pc_operand) {
    return ip + tail_size;
  }

#ifdef _LP64
  assert(which == narrow_oop_operand && !is_64bit, "instruction is not a movl adr, imm32");
#else
  assert(which == imm_operand, "instruction has only an imm field");
#endif // LP64
  return ip;
}
 820 
// Return the address of the instruction following |inst|.
address Assembler::locate_next_instruction(address inst) {
  // Secretly share code with locate_operand: the end_pc_operand pseudo
  // operand makes it walk the entire encoding and return its end.
  return locate_operand(inst, end_pc_operand);
}
 825 
 826 
#ifdef ASSERT
// Debug-only sanity check: verify that the operand the relocation will
// describe (located by re-decoding the instruction just emitted) is
// exactly where the data word is about to be emitted (pc()).
void Assembler::check_relocation(RelocationHolder const& rspec, int format) {
  address inst = inst_mark();
  assert(inst != NULL && inst < pc(), "must point to beginning of instruction");
  address opnd;

  Relocation* r = rspec.reloc();
  if (r->type() == relocInfo::none) {
    return;
  } else if (r->is_call() || format == call32_operand) {
    // assert(format == imm32_operand, "cannot specify a nonzero format");
    opnd = locate_operand(inst, call32_operand);
  } else if (r->is_data()) {
    assert(format == imm_operand || format == disp32_operand
           LP64_ONLY(|| format == narrow_oop_operand), "format ok");
    opnd = locate_operand(inst, (WhichOperand)format);
  } else {
    assert(format == imm_operand, "cannot specify a format");
    return;
  }
  assert(opnd == pc(), "must put operand where relocs can find it");
}
#endif // ASSERT
 850 
// Emit a memory operand restricted to the low 8 registers (no REX),
// for encodings where a REX prefix is not legal or not wanted.
void Assembler::emit_operand32(Register reg, Address adr) {
  assert(reg->encoding() < 8, "no extended registers");
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 857 
// Unpack an Address into the 7-argument core encoder.
// |rip_relative_correction| = bytes of trailing immediate, if any.
void Assembler::emit_operand(Register reg, Address adr,
                             int rip_relative_correction) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec,
               rip_relative_correction);
}
 864 
// XMM variant of the Address unpacking forwarder.
void Assembler::emit_operand(XMMRegister reg, Address adr) {
  emit_operand(reg, adr._base, adr._index, adr._scale, adr._disp,
               adr._rspec);
}
 869 
// MMX operations
// MMX registers never take REX, so reject extended base/index registers.
void Assembler::emit_operand(MMXRegister reg, Address adr) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}
 875 
// work around gcc (3.2.1-7a) bug
// Same as emit_operand(MMXRegister, Address) with the arguments swapped.
void Assembler::emit_operand(Address adr, MMXRegister reg) {
  assert(!adr.base_needs_rex() && !adr.index_needs_rex(), "no extended registers");
  emit_operand((Register)reg, adr._base, adr._index, adr._scale, adr._disp, adr._rspec);
}
 881 
 882 
// Emit a two-byte x87 arithmetic instruction: opcode byte 'b1' followed by
// 'b2' with the FPU stack index 'i' (0..7) folded into the low bits.
void Assembler::emit_farith(int b1, int b2, int i) {
  assert(isByte(b1) && isByte(b2), "wrong opcode");
  assert(0 <= i &&  i < 8, "illegal stack offset");
  emit_byte(b1);
  emit_byte(b2 + i);
}
 889 
 890 
 891 // Now the Assembler instructions (identical for 32/64 bits)
 892 
// ADC (add with carry) family.  The /2 opcode extension for the immediate
// forms is encoded by passing rdx (register number 2) as the reg field.
void Assembler::adcl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rdx, dst, imm32);
}

void Assembler::adcl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x11);
  emit_operand(src, dst);
}

void Assembler::adcl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD0, dst, imm32);
}

void Assembler::adcl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}

void Assembler::adcl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x13, 0xC0, dst, src);
}
 922 
// ADD family.  The /0 opcode extension for the immediate-to-memory form is
// encoded by passing rax (register number 0) as the reg field.
void Assembler::addl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rax, dst, imm32);
}

void Assembler::addl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}

void Assembler::addl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC0, dst, imm32);
}

void Assembler::addl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}

void Assembler::addl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
 952 
// Multi-byte NOPs built from the 0F 1F "NOP r/m32" encoding with dummy
// addressing modes; used to pad code to a given size in a single instruction.
void Assembler::addr_nop_4() {
  assert(UseAddressNop, "no CPU support");
  // 4 bytes: NOP DWORD PTR [EAX+0]
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x40); // emit_rm(cbuf, 0x1, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_5() {
  assert(UseAddressNop, "no CPU support");
  // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x44); // emit_rm(cbuf, 0x1, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_byte(0);    // 8-bits offset (1 byte)
}

void Assembler::addr_nop_7() {
  assert(UseAddressNop, "no CPU support");
  // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x80); // emit_rm(cbuf, 0x2, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}

void Assembler::addr_nop_8() {
  assert(UseAddressNop, "no CPU support");
  // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
  emit_byte(0x0F);
  emit_byte(0x1F);
  emit_byte(0x84); // emit_rm(cbuf, 0x2, EAX_enc, 0x4);
  emit_byte(0x00); // emit_rm(cbuf, 0x0, EAX_enc, EAX_enc);
  emit_long(0);    // 32-bits offset (4 bytes)
}
 990 
// Scalar FP add: ADDSD (double, F2 prefix) and ADDSS (single, F3 prefix),
// both opcode 0x58; VEX-encoded automatically when AVX is in use.
void Assembler::addsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
}

void Assembler::addsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F2);
}

void Assembler::addss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
}

void Assembler::addss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_F3);
}
1010 
// AND family.  The memory-immediate form uses opcode extension /4, encoded
// by passing rsp (register number 4) as the reg field.
// NOTE(review): unlike adcl/addl this form always emits 0x81 with a full
// imm32 rather than going through emit_arith_operand (which would pick the
// shorter 0x83/imm8 encoding) — possibly deliberate if callers patch the
// immediate; confirm before changing.
void Assembler::andl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rsp, dst, 4);
  emit_long(imm32);
}

void Assembler::andl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE0, dst, imm32);
}

void Assembler::andl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}

void Assembler::andl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x23, 0xC0, dst, src);
}
1035 
// Bit scan forward (0F BC): index of lowest set bit of src into dst.
void Assembler::bsfl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

// Bit scan reverse (0F BD).  On CPUs with LZCNT the same opcode preceded by
// F3 decodes as LZCNT, so this plain form must not be used there.
void Assembler::bsrl(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
1050 
// Byte-swap a 32-bit register (0F C8+rd).
void Assembler::bswapl(Register reg) { // bswap
  int encode = prefix_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}
1056 
// CALL rel32 to a label.  A bound label (necessarily behind us, hence the
// offs <= 0 assert) gets its displacement emitted directly; an unbound one
// records a patch site and emits a zero displacement to be fixed up later.
void Assembler::call(Label& L, relocInfo::relocType rtype) {
  // suspect disp32 is always good
  int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);

  if (L.is_bound()) {
    const int long_size = 5;
    int offs = (int)( target(L) - pc() );
    assert(offs <= 0, "assembler error");
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    emit_byte(0xE8);
    emit_data(offs - long_size, rtype, operand);
  } else {
    InstructionMark im(this);
    // 1110 1000 #32-bit disp
    L.add_patch_at(code(), locator());

    emit_byte(0xE8);
    emit_data(int(0), rtype, operand);
  }
}
1078 
// Indirect CALL through a register (FF /2).
void Assembler::call(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xD0 | encode);
}


// Indirect CALL through memory (FF /2); the /2 extension is encoded by
// passing rdx (register number 2) as the reg field.
void Assembler::call(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rdx, adr);
}
1092 
// Direct CALL rel32 to an absolute address with relocation info; the target
// must be reachable with a signed 32-bit displacement from the next pc.
void Assembler::call_literal(address entry, RelocationHolder const& rspec) {
  assert(entry != NULL, "call most probably wrong");
  InstructionMark im(this);
  emit_byte(0xE8);
  intptr_t disp = entry - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (call2)");
  // Technically, should use call32_operand, but this format is
  // implied by the fact that we're emitting a call instruction.

  int operand = LP64_ONLY(disp32_operand) NOT_LP64(call32_operand);
  emit_data((int) disp, rspec, operand);
}
1105 
// CDQ: sign-extend EAX into EDX:EAX (used before IDIV).
void Assembler::cdql() {
  emit_byte(0x99);
}
1109 
// Conditional move (0F 4x, x = condition code); CMOV is optional on 32-bit
// CPUs, hence the guarantee.
void Assembler::cmovl(Condition cc, Register dst, Register src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_byte(0xC0 | encode);
}


void Assembler::cmovl(Condition cc, Register dst, Address src) {
  NOT_LP64(guarantee(VM_Version::supports_cmov(), "illegal instruction"));
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
1126 
// CMP memory with immediate; opcode extension /7 is encoded by passing rdi
// (register number 7) as the reg field.  The extra emit_operand argument is
// the immediate size, for RIP-relative displacement correction.
void Assembler::cmpb(Address dst, int imm8) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x80);
  emit_operand(rdi, dst, 1);
  emit_byte(imm8);
}

void Assembler::cmpl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x81);
  emit_operand(rdi, dst, 4);
  emit_long(imm32);
}
1142 
// CMP register forms.
void Assembler::cmpl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF8, dst, imm32);
}

void Assembler::cmpl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x3B, 0xC0, dst, src);
}


void Assembler::cmpl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x3B);
  emit_operand(dst, src);
}
1160 
// 16-bit CMP memory/immediate: operand-size override (0x66) then 0x81 /7.
void Assembler::cmpw(Address dst, int imm16) {
  InstructionMark im(this);
  assert(!dst.base_needs_rex() && !dst.index_needs_rex(), "no extended registers");
  emit_byte(0x66);
  emit_byte(0x81);
  emit_operand(rdi, dst, 2);
  emit_word(imm16);
}
1169 
// The 32-bit cmpxchg compares the value at adr with the contents of rax,
// and stores reg into adr if so; otherwise, the value at adr is loaded into rax,.
// The ZF is set if the compared values were equal, and cleared otherwise.
void Assembler::cmpxchgl(Register reg, Address adr) { // cmpxchg
  // Bit 1 of the Atomics flag requests a software emulation of CMPXCHG
  // (diagnostic/workaround path); otherwise emit the real instruction.
  if (Atomics & 2) {
     // caveat: no instructionmark, so this isn't relocatable.
     // Emit a synthetic, non-atomic, CAS equivalent.
     // Beware.  The synthetic form sets all ICCs, not just ZF.
     // cmpxchg r,[m] is equivalent to rax, = CAS (m, rax, r)
     cmpl(rax, adr);
     movl(rax, adr);
     if (reg != rax) {
        Label L ;
        jcc(Assembler::notEqual, L);
        movl(adr, reg);
        bind(L);
     }
  } else {
     InstructionMark im(this);
     prefix(adr, reg);
     emit_byte(0x0F);
     emit_byte(0xB1);
     emit_operand(reg, adr);
  }
}
1195 
// Ordered scalar FP compare setting EFLAGS: COMISD (double, 0x66 prefix)
// and COMISS (single, no prefix), both opcode 0x2F.
void Assembler::comisd(XMMRegister dst, Address src) {
  // NOTE: dbx seems to decode this as comiss even though the
  // 0x66 is there. Strangely ucomisd comes out correct
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
}

void Assembler::comisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_66);
}

void Assembler::comiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
}

void Assembler::comiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2F, dst, src, VEX_SIMD_NONE);
}
1217 
// Packed int32 -> FP conversions: CVTDQ2PD (F3 E6) and CVTDQ2PS (5B).
void Assembler::cvtdq2pd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0xE6, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtdq2ps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x5B, dst, src, VEX_SIMD_NONE);
}
1227 
// Scalar FP conversions: CVTSD2SS/CVTSS2SD (opcode 0x5A) convert between
// double and single; CVTSI2SD/CVTSI2SS (opcode 0x2A) convert a 32-bit
// integer to FP.  Prefix F2 selects the double form, F3 the single form.
void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsd2ss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2sdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F2);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

void Assembler::cvtsi2ssl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x2A, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
}

void Assembler::cvtss2sd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5A, dst, src, VEX_SIMD_F3);
}
1271 
1272 
// Truncating scalar FP -> int32 conversions (opcode 0x2C): CVTTSD2SI (F2)
// and CVTTSS2SI (F3).
void Assembler::cvttsd2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

void Assembler::cvttss2sil(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
1286 
// DEC memory (FF /1); the /1 extension is encoded by passing rcx.
void Assembler::decl(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrement() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}
1294 
// Scalar FP divide: DIVSD (double, F2) and DIVSS (single, F3), opcode 0x5E.
void Assembler::divsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
}

void Assembler::divsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F2);
}

void Assembler::divss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
}

void Assembler::divss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_F3);
}
1314 
// EMMS (0F 77): clear MMX state so the x87 stack is usable again.
void Assembler::emms() {
  NOT_LP64(assert(VM_Version::supports_mmx(), ""));
  emit_byte(0x0F);
  emit_byte(0x77);
}

// HLT: halt the processor (used to fill unreachable code).
void Assembler::hlt() {
  emit_byte(0xF4);
}
1324 
// Signed divide EDX:EAX by src (F7 /7).
void Assembler::idivl(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}

// Unsigned divide EDX:EAX by src (F7 /6).
void Assembler::divl(Register src) { // Unsigned
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF0 | encode);
}
1336 
// Two-operand signed multiply: dst *= src (0F AF).
void Assembler::imull(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}
1343 
1344 
1345 void Assembler::imull(Register dst, Register src, int value) {
1346   int encode = prefix_and_encode(dst->encoding(), src->encoding());
1347   if (is8bit(value)) {
1348     emit_byte(0x6B);
1349     emit_byte(0xC0 | encode);
1350     emit_byte(value & 0xFF);
1351   } else {
1352     emit_byte(0x69);
1353     emit_byte(0xC0 | encode);
1354     emit_long(value);
1355   }
1356 }
1357 
// INC memory (FF /0); the /0 extension is encoded by passing rax.
void Assembler::incl(Address dst) {
  // Don't use it directly. Use MacroAssembler::increment() instead.
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}
1365 
// Conditional jump to a label.  A bound label gets the 2-byte short form
// when 'maybe_short' is set and the displacement fits in 8 bits, otherwise
// the 6-byte long form; an unbound label always reserves the long form and
// records a patch site.
void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
  InstructionMark im(this);
  assert((0 <= cc) && (cc < 16), "illegal cc");
  if (L.is_bound()) {
    address dst = target(L);
    assert(dst != NULL, "jcc most probably wrong");

    const int short_size = 2;
    const int long_size = 6;
    intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      assert(is_simm32(offs - long_size),
             "must be 32bit offset (call4)");
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
    // Note: could eliminate cond. jumps to this jump if condition
    //       is the same however, seems to be rather unlikely case.
    // Note: use jccb() if label to be bound is very close to get
    //       an 8-bit displacement
    L.add_patch_at(code(), locator());
    emit_byte(0x0F);
    emit_byte(0x80 | cc);
    emit_long(0);
  }
}
1399 
1400 void Assembler::jccb(Condition cc, Label& L) {
1401   if (L.is_bound()) {
1402     const int short_size = 2;
1403     address entry = target(L);
1404 #ifdef ASSERT
1405     intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
1406     intptr_t delta = short_branch_delta();
1407     if (delta != 0) {
1408       dist += (dist < 0 ? (-delta) :delta);
1409     }
1410     assert(is8bit(dist), "Dispacement too large for a short jmp");
1411 #endif
1412     intptr_t offs = (intptr_t)entry - (intptr_t)_code_pos;
1413     // 0111 tttn #8-bit disp
1414     emit_byte(0x70 | cc);
1415     emit_byte((offs - short_size) & 0xFF);
1416   } else {
1417     InstructionMark im(this);
1418     L.add_patch_at(code(), locator());
1419     emit_byte(0x70 | cc);
1420     emit_byte(0);
1421   }
1422 }
1423 
// Indirect JMP through memory (FF /4); the /4 extension is encoded by
// passing rsp (register number 4) as the reg field.
void Assembler::jmp(Address adr) {
  InstructionMark im(this);
  prefix(adr);
  emit_byte(0xFF);
  emit_operand(rsp, adr);
}
1430 
// Unconditional jump to a label: short (EB rel8) when 'maybe_short' and the
// displacement fits, else long (E9 rel32); unbound labels reserve the long
// form and record a patch site.
void Assembler::jmp(Label& L, bool maybe_short) {
  if (L.is_bound()) {
    address entry = target(L);
    assert(entry != NULL, "jmp most probably wrong");
    InstructionMark im(this);
    const int short_size = 2;
    const int long_size = 5;
    intptr_t offs = entry - _code_pos;
    if (maybe_short && is8bit(offs - short_size)) {
      emit_byte(0xEB);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      emit_byte(0xE9);
      emit_long(offs - long_size);
    }
  } else {
    // By default, forward jumps are always 32-bit displacements, since
    // we can't yet know where the label will be bound.  If you're sure that
    // the forward jump will not run beyond 256 bytes, use jmpb to
    // force an 8-bit displacement.
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    emit_byte(0xE9);
    emit_long(0);
  }
}
1457 
// Indirect JMP through a register (FF /4).
void Assembler::jmp(Register entry) {
  int encode = prefix_and_encode(entry->encoding());
  emit_byte(0xFF);
  emit_byte(0xE0 | encode);
}
1463 
// Direct JMP rel32 to an absolute address with relocation info; the target
// must be reachable with a signed 32-bit displacement from the next pc.
void Assembler::jmp_literal(address dest, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xE9);
  assert(dest != NULL, "must have a target");
  intptr_t disp = dest - (_code_pos + sizeof(int32_t));
  assert(is_simm32(disp), "must be 32bit offset (jmp)");
  emit_data(disp, rspec.reloc(), call32_operand);
}
1472 
1473 void Assembler::jmpb(Label& L) {
1474   if (L.is_bound()) {
1475     const int short_size = 2;
1476     address entry = target(L);
1477     assert(entry != NULL, "jmp most probably wrong");
1478 #ifdef ASSERT
1479     intptr_t dist = (intptr_t)entry - ((intptr_t)_code_pos + short_size);
1480     intptr_t delta = short_branch_delta();
1481     if (delta != 0) {
1482       dist += (dist < 0 ? (-delta) :delta);
1483     }
1484     assert(is8bit(dist), "Dispacement too large for a short jmp");
1485 #endif
1486     intptr_t offs = entry - _code_pos;
1487     emit_byte(0xEB);
1488     emit_byte((offs - short_size) & 0xFF);
1489   } else {
1490     InstructionMark im(this);
1491     L.add_patch_at(code(), locator());
1492     emit_byte(0xEB);
1493     emit_byte(0);
1494   }
1495 }
1496 
// LDMXCSR (0F AE /2): load the SSE control/status register from memory.
void Assembler::ldmxcsr( Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(2), src); // /2 opcode extension
}
1505 
// LEA with a 32-bit result.  On 64-bit, the 0x67 address-size override
// forces 32-bit address arithmetic so the computation wraps like leal.
void Assembler::leal(Register dst, Address src) {
  InstructionMark im(this);
#ifdef _LP64
  emit_byte(0x67); // addr32
  prefix(src, dst);
#endif // _LP64
  emit_byte(0x8D);
  emit_operand(dst, src);
}
1515 
1516 void Assembler::lock() {
1517   if (Atomics & 1) {
1518      // Emit either nothing, a NOP, or a NOP: prefix
1519      emit_byte(0x90) ;
1520   } else {
1521      emit_byte(0xF0);
1522   }
1523 }
1524 
// LZCNT (F3 0F BD): count leading zeros.  On CPUs without LZCNT this
// encoding silently decodes as BSR, hence the assert.
void Assembler::lzcntl(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}
1533 
// Emit mfence instruction (0F AE F0): full memory fence.
void Assembler::mfence() {
  NOT_LP64(assert(VM_Version::supports_sse2(), "unsupported");)
  emit_byte( 0x0F );
  emit_byte( 0xAE );
  emit_byte( 0xF0 );
}
1541 
// Pointer-sized register move: movq on 64-bit, movl on 32-bit.
void Assembler::mov(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
1545 
// Aligned packed FP register moves: MOVAPD (0x66 prefix) / MOVAPS, 0x28.
void Assembler::movapd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_66);
}

void Assembler::movaps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x28, dst, src, VEX_SIMD_NONE);
}
1555 
// MOVLHPS (0F 16): copy the low quadword of src into the high quadword of dst.
void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);
  emit_byte(0x16);
  emit_byte(0xC0 | encode);
}
1562 
// 8-bit moves.  The 'true' passed to prefix() marks a byte-register
// instruction so a REX prefix is forced for SPL/BPL/SIL/DIL on 64-bit.
void Assembler::movb(Register dst, Address src) {
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  InstructionMark im(this);
  prefix(src, dst, true);
  emit_byte(0x8A);
  emit_operand(dst, src);
}


void Assembler::movb(Address dst, int imm8) {
  InstructionMark im(this);
   prefix(dst);
  emit_byte(0xC6);
  emit_operand(rax, dst, 1);
  emit_byte(imm8);
}


void Assembler::movb(Address dst, Register src) {
  assert(src->has_byte_register(), "must have byte register");
  InstructionMark im(this);
  prefix(dst, src, true);
  emit_byte(0x88);
  emit_operand(src, dst);
}
1588 
// MOVD: 32-bit transfers between XMM registers and GPRs/memory.
// 0x6E moves into an XMM register, 0x7E moves out of one.
void Assembler::movdl(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}

void Assembler::movdl(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_operand(dst, src);
}

void Assembler::movdl(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_operand(src, dst);
}
1619 
// 128-bit vector moves: MOVDQA (aligned, 0x66 prefix) and MOVDQU
// (unaligned, F3 prefix); 0x6F loads, 0x7F stores.
void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_66);
}

void Assembler::movdqu(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
}

void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x6F, dst, src, VEX_SIMD_F3);
}

void Assembler::movdqu(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7F);
  emit_operand(src, dst);
}
1642 
// Move Unaligned 256bit Vector (VEX.256-encoded VMOVDQU; requires AVX).
void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
  assert(UseAVX, "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_byte(0xC0 | encode);
}

void Assembler::vmovdqu(XMMRegister dst, Address src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::vmovdqu(Address dst, XMMRegister src) {
  assert(UseAVX, "");
  InstructionMark im(this);
  bool vector256 = true;
  // swap src<->dst for encoding
  assert(src != xnoreg, "sanity");
  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
  emit_byte(0x7F);
  emit_operand(src, dst);
}
1671 
1672 // Uses zero extension on 64bit
1673 
// 32-bit moves (zero-extending on 64-bit).
void Assembler::movl(Register dst, int32_t imm32) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode); // B8+rd id
  emit_long(imm32);
}

void Assembler::movl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}

void Assembler::movl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

void Assembler::movl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4); // /0 extension; imm32 follows the operand
  emit_long(imm32);
}

void Assembler::movl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
1707 
// New cpus require to use movsd and movss to avoid partial register stall
// when loading from memory. But for old Opteron use movlpd instead of movsd.
// The selection is done in MacroAssembler::movdbl() and movflt().
void Assembler::movlpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x12, dst, src, VEX_SIMD_66);
}
1715 
// 64-bit MMX moves (0F 6F load, 0F 7F store).
void Assembler::movq( MMXRegister dst, Address src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x6F);
  emit_operand(dst, src);
}

void Assembler::movq( Address dst, MMXRegister src ) {
  assert( VM_Version::supports_mmx(), "" );
  emit_byte(0x0F);
  emit_byte(0x7F);
  // workaround gcc (3.2.1-7a) bug
  // In that version of gcc with only an emit_operand(MMX, Address)
  // gcc will tail jump and try and reverse the parameters completely
  // obliterating dst in the process. By having a version available
  // that doesn't need to swap the args at the tail jump the bug is
  // avoided.
  emit_operand(dst, src);
}
1735 
// 64-bit scalar XMM moves: MOVQ load (F3 0F 7E) and store (66 0F D6).
void Assembler::movq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x7E);
  emit_operand(dst, src);
}

void Assembler::movq(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0xD6);
  emit_operand(src, dst);
}
1751 
// MOVSX byte -> 32-bit (0F BE): sign-extending byte load/move.
void Assembler::movsbl(Register dst, Address src) { // movsxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}

void Assembler::movsbl(Register dst, Register src) { // movsxb
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);
}
1767 
// Scalar double moves: MOVSD (F2 prefix), 0x10 load / 0x11 store.
void Assembler::movsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x10, dst, src, VEX_SIMD_F2);
}

void Assembler::movsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F2);
}

void Assembler::movsd(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x11);
  emit_operand(src, dst);
}
1785 
// Scalar single moves: MOVSS (F3 prefix), 0x10 load / 0x11 store.
void Assembler::movss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x10, dst, src, VEX_SIMD_F3);
}

void Assembler::movss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x10, dst, src, VEX_SIMD_F3);
}

void Assembler::movss(Address dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F3);
  emit_byte(0x11);
  emit_operand(src, dst);
}
1803 
// MOVSX word -> 32-bit (0F BF): sign-extending 16-bit load/move.
void Assembler::movswl(Register dst, Address src) { // movsxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}

void Assembler::movswl(Register dst, Register src) { // movsxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}
1818 
// 16-bit moves; 0x66 is the operand-size override selecting 16-bit mode.
void Assembler::movw(Address dst, int imm16) {
  InstructionMark im(this);

  emit_byte(0x66); // switch to 16-bit mode
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 2);
  emit_word(imm16);
}

void Assembler::movw(Register dst, Address src) {
  InstructionMark im(this);
  emit_byte(0x66);
  prefix(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

void Assembler::movw(Address dst, Register src) {
  InstructionMark im(this);
  emit_byte(0x66);
  prefix(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}
1844 
// MOVZX r32, m8 (0F B6 /r): zero-extend an 8-bit memory operand into dst.
void Assembler::movzbl(Register dst, Address src) { // movzxb
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}
1852 
// MOVZX r32, r8 (0F B6 /r): zero-extend the low byte of src into dst.
void Assembler::movzbl(Register dst, Register src) { // movzxb
  // On 32-bit only AL..BH are byte-addressable; 64-bit uses a REX prefix instead.
  NOT_LP64(assert(src->has_byte_register(), "must have byte register"));
  int encode = prefix_and_encode(dst->encoding(), src->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}
1860 
// MOVZX r32, m16 (0F B7 /r): zero-extend a 16-bit memory operand into dst.
void Assembler::movzwl(Register dst, Address src) { // movzxw
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}
1868 
// MOVZX r32, r16 (0F B7 /r): zero-extend the low word of src into dst.
void Assembler::movzwl(Register dst, Register src) { // movzxw
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}
1875 
// MUL m32 (F7 /4): unsigned multiply EDX:EAX = EAX * [src].
void Assembler::mull(Address src) {
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xF7);
  emit_operand(rsp, src);  // rsp here only encodes the ModRM reg field /4
}
1882 
// MUL r32 (F7 /4): unsigned multiply EDX:EAX = EAX * src.
void Assembler::mull(Register src) {
  int encode = prefix_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xE0 | encode);  // 0xE0 = ModRM mod=11, reg=/4
}
1888 
// MULSD xmm, m64 (F2 0F 59): scalar double multiply, dst *= [src].
void Assembler::mulsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
}
1893 
// MULSD xmm, xmm (F2 0F 59): scalar double multiply, dst *= src.
void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F2);
}
1898 
// MULSS xmm, m32 (F3 0F 59): scalar float multiply, dst *= [src].
void Assembler::mulss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
}
1903 
// MULSS xmm, xmm (F3 0F 59): scalar float multiply, dst *= src.
void Assembler::mulss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_F3);
}
1908 
// NEG r32 (F7 /3): two's-complement negate dst.
void Assembler::negl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD8 | encode);  // 0xD8 = ModRM mod=11, reg=/3
}
1914 
// Emit exactly i bytes of nop padding (i > 0), choosing the densest encoding
// the current CPU prefers: multi-byte address nops (0F 1F [address]) on
// Intel/AMD when UseAddressNop is set, otherwise 0x66-prefixed 0x90 nops.
// All byte sequences below are patching-safe padding.
void Assembler::nop(int i) {
#ifdef ASSERT
  assert(i > 0, " ");
  // The fancy nops aren't currently recognized by debuggers making it a
  // pain to disassemble code while debugging. If asserts are on clearly
  // speed is not an issue so simply use the single byte traditional nop
  // to do alignment.

  for (; i > 0 ; i--) emit_byte(0x90);
  return;

#endif // ASSERT

  // Product builds only from here on (the ASSERT path above returns).
  if (UseAddressNop && VM_Version::is_intel()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for Intel
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is Intel specific - don't use consecutive address nops

    // 12: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 13: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 14: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90
    // 15: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x66 0x66 0x66 0x90

    while(i >= 15) {
      // For Intel don't generate consecutive address nops (mix with regular nops)
      i -= 15;
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      addr_nop_8();
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x66);   // size prefix
      emit_byte(0x90);   // nop
    }
    // Remainder of 0..14 bytes; the cases deliberately fall through to
    // accumulate 0x66 prefixes in front of a single final nop.
    switch (i) {
      case 14:
        emit_byte(0x66); // size prefix
      case 13:
        emit_byte(0x66); // size prefix
      case 12:
        addr_nop_8();
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x66); // size prefix
        emit_byte(0x90); // nop
        break;
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }
  if (UseAddressNop && VM_Version::is_amd()) {
    //
    // Using multi-bytes nops "0x0F 0x1F [address]" for AMD.
    //  1: 0x90
    //  2: 0x66 0x90
    //  3: 0x66 0x66 0x90 (don't use "0x0F 0x1F 0x00" - need patching safe padding)
    //  4: 0x0F 0x1F 0x40 0x00
    //  5: 0x0F 0x1F 0x44 0x00 0x00
    //  6: 0x66 0x0F 0x1F 0x44 0x00 0x00
    //  7: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    //  8: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //  9: 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 10: 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    // 11: 0x66 0x66 0x66 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00

    // The rest coding is AMD specific - use consecutive address nops

    // 12: 0x66 0x0F 0x1F 0x44 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 13: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x66 0x0F 0x1F 0x44 0x00 0x00
    // 14: 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 15: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x80 0x00 0x00 0x00 0x00
    // 16: 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00 0x0F 0x1F 0x84 0x00 0x00 0x00 0x00 0x00
    //     Size prefixes (0x66) are added for larger sizes

    while(i >= 22) {
      i -= 11;
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      emit_byte(0x66); // size prefix
      addr_nop_8();
    }
    // Generate first nop for size between 21-12
    switch (i) {
      case 21:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 20:
      case 19:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 18:
      case 17:
        i -= 1;
        emit_byte(0x66); // size prefix
      case 16:
      case 15:
        i -= 8;
        addr_nop_8();
        break;
      case 14:
      case 13:
        i -= 7;
        addr_nop_7();
        break;
      case 12:
        i -= 6;
        emit_byte(0x66); // size prefix
        addr_nop_5();
        break;
      default:
        assert(i < 12, " ");
    }

    // Generate second nop for size between 11-1
    switch (i) {
      case 11:
        emit_byte(0x66); // size prefix
      case 10:
        emit_byte(0x66); // size prefix
      case 9:
        emit_byte(0x66); // size prefix
      case 8:
        addr_nop_8();
        break;
      case 7:
        addr_nop_7();
        break;
      case 6:
        emit_byte(0x66); // size prefix
      case 5:
        addr_nop_5();
        break;
      case 4:
        addr_nop_4();
        break;
      case 3:
        // Don't use "0x0F 0x1F 0x00" - need patching safe padding
        emit_byte(0x66); // size prefix
      case 2:
        emit_byte(0x66); // size prefix
      case 1:
        emit_byte(0x90); // nop
        break;
      default:
        assert(i == 0, " ");
    }
    return;
  }

  // Fallback: neither vendor-specific path applies.
  // Using nops with size prefixes "0x66 0x90".
  // From AMD Optimization Guide:
  //  1: 0x90
  //  2: 0x66 0x90
  //  3: 0x66 0x66 0x90
  //  4: 0x66 0x66 0x66 0x90
  //  5: 0x66 0x66 0x90 0x66 0x90
  //  6: 0x66 0x66 0x90 0x66 0x66 0x90
  //  7: 0x66 0x66 0x66 0x90 0x66 0x66 0x90
  //  8: 0x66 0x66 0x66 0x90 0x66 0x66 0x66 0x90
  //  9: 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  // 10: 0x66 0x66 0x66 0x90 0x66 0x66 0x90 0x66 0x66 0x90
  //
  while(i > 12) {
    i -= 4;
    emit_byte(0x66); // size prefix
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90); // nop
  }
  // 1 - 12 nops
  if(i > 8) {
    if(i > 9) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  // 1 - 8 nops
  if(i > 4) {
    if(i > 6) {
      i -= 1;
      emit_byte(0x66);
    }
    i -= 3;
    emit_byte(0x66);
    emit_byte(0x66);
    emit_byte(0x90);
  }
  switch (i) {
    case 4:
      emit_byte(0x66);
    case 3:
      emit_byte(0x66);
    case 2:
      emit_byte(0x66);
    case 1:
      emit_byte(0x90);
      break;
    default:
      assert(i == 0, " ");
  }
}
2162 
// NOT r32 (F7 /2): bitwise complement of dst.
void Assembler::notl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD0 | encode );  // 0xD0 = ModRM mod=11, reg=/2
}
2168 
// OR m32, imm32 (81 /1): bitwise-or an immediate into memory.
void Assembler::orl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rcx, dst, imm32);  // rcx encodes ModRM reg field /1
}
2174 
// OR r32, imm32 (81 /1): bitwise-or an immediate into dst.
void Assembler::orl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xC8, dst, imm32);  // 0xC8 = ModRM mod=11, reg=/1
}
2179 
// OR r32, m32 (0B /r): bitwise-or a memory operand into dst.
void Assembler::orl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}
2186 
// OR r32, r32 (0B /r): bitwise-or src into dst.
void Assembler::orl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}
2191 
// PACKUSWB xmm, m128 (66 0F 67): pack words to unsigned saturated bytes.
void Assembler::packuswb(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // Legacy SSE form requires 16-byte-aligned memory; only AVX lifts that.
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
}
2197 
// PACKUSWB xmm, xmm (66 0F 67): pack words to unsigned saturated bytes.
void Assembler::packuswb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x67, dst, src, VEX_SIMD_66);
}
2202 
// PCMPESTRI xmm, m128, imm8 (66 0F 3A 61 ib): SSE4.2 explicit-length string compare.
void Assembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_operand(dst, src);
  emit_byte(imm8);  // comparison mode control
}
2211 
// PCMPESTRI xmm, xmm, imm8 (66 0F 3A 61 ib): SSE4.2 explicit-length string compare.
void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
  assert(VM_Version::supports_sse4_2(), "");
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);
  emit_byte(0x61);
  emit_byte(0xC0 | encode);
  emit_byte(imm8);  // comparison mode control
}
2219 
// PMOVZXBW xmm, m64 (66 0F 38 30): zero-extend 8 packed bytes to 8 words.
void Assembler::pmovzxbw(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_operand(dst, src);
}
2227 
// PMOVZXBW xmm, xmm (66 0F 38 30): zero-extend 8 packed bytes to 8 words.
void Assembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x30);
  emit_byte(0xC0 | encode);
}
2234 
2235 // generic
// POP r (58+rd): pop a pointer-sized word off the stack into dst.
void Assembler::pop(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0x58 | encode);  // register number folded into the opcode byte
}
2240 
// POPCNT r32, m32 (F3 0F B8 /r): count set bits of a memory operand.
void Assembler::popcntl(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);  // mandatory prefix, emitted before any REX prefix
  prefix(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}
2250 
// POPCNT r32, r32 (F3 0F B8 /r): count set bits of src into dst.
void Assembler::popcntl(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);  // mandatory prefix, emitted before any REX prefix
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}
2259 
// POPF (9D): pop the flags register from the stack.
void Assembler::popf() {
  emit_byte(0x9D);
}
2263 
2264 #ifndef _LP64 // no 32bit push/pop on amd64
// POP m32 (8F /0): pop into a memory operand. 32-bit builds only (see guard).
void Assembler::popl(Address dst) {
  // NOTE: this will adjust stack by 8byte on 64bits
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst);  // rax encodes ModRM reg field /0
}
2272 #endif
2273 
// Shared helper: emit the address-size/REX prefix plus the 0F escape byte
// common to all prefetch instruction encodings.
void Assembler::prefetch_prefix(Address src) {
  prefix(src);
  emit_byte(0x0F);
}
2278 
// PREFETCHNTA m8 (0F 18 /0): non-temporal prefetch hint.
void Assembler::prefetchnta(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rax, src); // 0, src
}
2286 
// PREFETCH m8 (0F 0D /0): 3DNow!-family prefetch-for-read hint.
void Assembler::prefetchr(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rax, src); // 0, src
}
2294 
// PREFETCHT0 m8 (0F 18 /1): prefetch into all cache levels.
void Assembler::prefetcht0(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rcx, src); // 1, src
}
2302 
// PREFETCHT1 m8 (0F 18 /2): prefetch into L2 and higher.
void Assembler::prefetcht1(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rdx, src); // 2, src
}
2310 
// PREFETCHT2 m8 (0F 18 /3): prefetch into L3/outer cache levels.
void Assembler::prefetcht2(Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x18);
  emit_operand(rbx, src); // 3, src
}
2318 
// PREFETCHW m8 (0F 0D /1): 3DNow!-family prefetch with intent to write.
void Assembler::prefetchw(Address src) {
  assert(VM_Version::supports_3dnow_prefetch(), "must support");
  InstructionMark im(this);
  prefetch_prefix(src);
  emit_byte(0x0D);
  emit_operand(rcx, src); // 1, src
}
2326 
// Emit an explicit single-byte instruction prefix (e.g. REX, lock, segment).
void Assembler::prefix(Prefix p) {
  a_byte(p);
}
2330 
// PSHUFD xmm, xmm, imm8 (66 0F 70 ib): shuffle dwords of src per mode into dst.
void Assembler::pshufd(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_66);
  emit_byte(mode & 0xFF);  // shuffle control immediate

}
2338 
// PSHUFD xmm, m128, imm8 (66 0F 70 ib): shuffle dwords from memory into dst.
void Assembler::pshufd(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // Legacy SSE form requires 16-byte-aligned memory; only AVX lifts that.
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);  // shuffle control immediate
}
2349 
// PSHUFLW xmm, xmm, imm8 (F2 0F 70 ib): shuffle the four low words of src.
void Assembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x70, dst, src, VEX_SIMD_F2);
  emit_byte(mode & 0xFF);  // shuffle control immediate
}
2356 
// PSHUFLW xmm, m128, imm8 (F2 0F 70 ib): shuffle the four low words from memory.
void Assembler::pshuflw(XMMRegister dst, Address src, int mode) {
  assert(isByte(mode), "invalid value");
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // Legacy SSE form requires 16-byte-aligned memory; only AVX lifts that.
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_F2);
  emit_byte(0x70);
  emit_operand(dst, src);
  emit_byte(mode & 0xFF);  // shuffle control immediate
}
2367 
// PSRLDQ xmm, imm8 (66 0F 73 /3 ib): byte-wise logical right shift of dst.
void Assembler::psrldq(XMMRegister dst, int shift) {
  // Shift 128 bit value in xmm register by number of bytes.
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66);  // xmm3 encodes the /3 opcode extension
  emit_byte(0x73);
  emit_byte(0xC0 | encode);
  emit_byte(shift);
}
2376 
// PTEST xmm, m128 (66 0F 38 17): logical-AND test, sets ZF/CF only.
void Assembler::ptest(XMMRegister dst, Address src) {
  assert(VM_Version::supports_sse4_1(), "");
  // Legacy SSE form requires 16-byte-aligned memory; only AVX lifts that.
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  InstructionMark im(this);
  simd_prefix(dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_operand(dst, src);
}
2385 
// PTEST xmm, xmm (66 0F 38 17): logical-AND test, sets ZF/CF only.
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_sse4_1(), "");
  int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_byte(0x17);
  emit_byte(0xC0 | encode);
}
2392 
// PUNPCKLBW xmm, m128 (66 0F 60): interleave low bytes of dst and [src].
void Assembler::punpcklbw(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // Legacy SSE form requires 16-byte-aligned memory; only AVX lifts that.
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
}
2398 
// PUNPCKLBW xmm, xmm (66 0F 60): interleave low bytes of dst and src.
void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
}
2403 
// PUNPCKLDQ xmm, m128 (66 0F 62): interleave low dwords of dst and [src].
void Assembler::punpckldq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // Legacy SSE form requires 16-byte-aligned memory; only AVX lifts that.
  assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
}
2409 
// PUNPCKLDQ xmm, xmm (66 0F 62): interleave low dwords of dst and src.
void Assembler::punpckldq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x62, dst, src, VEX_SIMD_66);
}
2414 
// PUNPCKLQDQ xmm, xmm (66 0F 6C): interleave low qwords of dst and src.
void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x6C, dst, src, VEX_SIMD_66);
}
2419 
// PUSH imm32 (68 id): push a sign-extended 32-bit immediate.
void Assembler::push(int32_t imm32) {
  // in 64bits we push 64bits onto the stack but only
  // take a 32bit immediate
  emit_byte(0x68);
  emit_long(imm32);
}
2426 
// PUSH r (50+rd): push a pointer-sized register onto the stack.
void Assembler::push(Register src) {
  int encode = prefix_and_encode(src->encoding());

  emit_byte(0x50 | encode);  // register number folded into the opcode byte
}
2432 
// PUSHF (9C): push the flags register onto the stack.
void Assembler::pushf() {
  emit_byte(0x9C);
}
2436 
2437 #ifndef _LP64 // no 32bit push/pop on amd64
// PUSH m32 (FF /6): push a memory operand. 32-bit builds only (see guard).
void Assembler::pushl(Address src) {
  // Note this will push 64bit on 64bit
  InstructionMark im(this);
  prefix(src);
  emit_byte(0xFF);
  emit_operand(rsi, src);  // rsi encodes ModRM reg field /6
}
2445 #endif
2446 
// RCL r32, imm8 (D1 /2 for count 1, else C1 /2 ib): rotate left through CF.
void Assembler::rcll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  if (imm8 == 1) {
    // one-byte-shorter rotate-by-1 form
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}
2459 
2460 // copies data from [esi] to [edi] using rcx pointer sized words
2461 // generic
// REP MOVS (F3 [REX.W] A5): copy rcx pointer-sized words from [rsi] to [rdi].
void Assembler::rep_mov() {
  emit_byte(0xF3);  // REP prefix
  // MOVSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xA5);
}
2468 
// stores the value in rax into rcx pointer-sized words starting at [edi]
2470 // generic
// REP STOS (F3 [REX.W] AB): fill rcx pointer-sized words at [rdi] with rax.
void Assembler::rep_set() { // rep_set
  emit_byte(0xF3);  // REP prefix
  // STOSQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAB);
}
2477 
// scans rcx pointer-sized words at [edi] for an occurrence of rax
2479 // generic
// REPNE SCAS (F2 [REX.W] AF): scan rcx pointer-sized words at [rdi] for rax.
void Assembler::repne_scan() { // repne_scan
  emit_byte(0xF2);  // REPNE prefix
  // SCASQ
  LP64_ONLY(prefix(REX_W));
  emit_byte(0xAF);
}
2486 
2487 #ifdef _LP64
// scans rcx 4-byte words at [edi] for an occurrence of rax
2489 // generic
// REPNE SCASD (F2 AF, no REX.W): 32-bit-element scan; LP64 builds only.
void Assembler::repne_scanl() { // repne_scan
  emit_byte(0xF2);  // REPNE prefix
  // SCASL
  emit_byte(0xAF);
}
2495 #endif
2496 
2497 void Assembler::ret(int imm16) {
2498   if (imm16 == 0) {
2499     emit_byte(0xC3);
2500   } else {
2501     emit_byte(0xC2);
2502     emit_word(imm16);
2503   }
2504 }
2505 
// SAHF (9E): store AH into the low flags byte. 32-bit only in this assembler.
void Assembler::sahf() {
#ifdef _LP64
  // Not supported in 64bit mode
  ShouldNotReachHere();
#endif
  emit_byte(0x9E);
}
2513 
2514 void Assembler::sarl(Register dst, int imm8) {
2515   int encode = prefix_and_encode(dst->encoding());
2516   assert(isShiftCount(imm8), "illegal shift count");
2517   if (imm8 == 1) {
2518     emit_byte(0xD1);
2519     emit_byte(0xF8 | encode);
2520   } else {
2521     emit_byte(0xC1);
2522     emit_byte(0xF8 | encode);
2523     emit_byte(imm8);
2524   }
2525 }
2526 
// SAR r32, CL (D3 /7): arithmetic right shift by the count in CL.
void Assembler::sarl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);  // 0xF8 = ModRM mod=11, reg=/7
}
2532 
// SBB m32, imm32 (81 /3): subtract-with-borrow an immediate from memory.
void Assembler::sbbl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbx, dst, imm32);  // rbx encodes ModRM reg field /3
}
2538 
// SBB r32, imm32 (81 /3): subtract-with-borrow an immediate from dst.
void Assembler::sbbl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xD8, dst, imm32);  // 0xD8 = ModRM mod=11, reg=/3
}
2543 
2544 
// SBB r32, m32 (1B /r): subtract-with-borrow a memory operand from dst.
void Assembler::sbbl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}
2551 
// SBB r32, r32 (1B /r): subtract-with-borrow src from dst.
void Assembler::sbbl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}
2556 
// SETcc r8 (0F 90+cc /r): set dst's low byte to 1 if condition cc holds, else 0.
void Assembler::setb(Condition cc, Register dst) {
  assert(0 <= cc && cc < 16, "illegal cc");
  int encode = prefix_and_encode(dst->encoding(), true);  // true: byte-register encoding
  emit_byte(0x0F);
  emit_byte(0x90 | cc);  // condition folded into the opcode byte
  emit_byte(0xC0 | encode);
}
2564 
// SHL r32, imm8 (D1 /4 for count 1, else C1 /4 ib): logical left shift.
void Assembler::shll(Register dst, int imm8) {
  assert(isShiftCount(imm8), "illegal shift count");
  int encode = prefix_and_encode(dst->encoding());
  if (imm8 == 1 ) {
    // one-byte-shorter shift-by-1 form
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}
2577 
// SHL r32, CL (D3 /4): logical left shift by the count in CL.
void Assembler::shll(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);  // 0xE0 = ModRM mod=11, reg=/4
}
2583 
2584 void Assembler::shrl(Register dst, int imm8) {
2585   assert(isShiftCount(imm8), "illegal shift count");
2586   int encode = prefix_and_encode(dst->encoding());
2587   emit_byte(0xC1);
2588   emit_byte(0xE8 | encode);
2589   emit_byte(imm8);
2590 }
2591 
// SHR r32, CL (D3 /5): logical right shift by the count in CL.
void Assembler::shrl(Register dst) {
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);  // 0xE8 = ModRM mod=11, reg=/5
}
2597 
2598 // copies a single word from [esi] to [edi]
// MOVS (A5): copy one word from [rsi/esi] to [rdi/edi].
void Assembler::smovl() {
  emit_byte(0xA5);
}
2602 
// SQRTSD xmm, xmm (F2 0F 51): scalar double square root.
void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
}
2607 
// SQRTSD xmm, m64 (F2 0F 51): scalar double square root of a memory operand.
void Assembler::sqrtsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F2);
}
2612 
// SQRTSS xmm, xmm (F3 0F 51): scalar float square root.
void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
}
2617 
// SQRTSS xmm, m32 (F3 0F 51): scalar float square root of a memory operand.
void Assembler::sqrtss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x51, dst, src, VEX_SIMD_F3);
}
2622 
// STMXCSR m32 (0F AE /3): store the MXCSR control/status register to memory.
void Assembler::stmxcsr( Address dst) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(3), dst);  // register 3 encodes ModRM reg field /3
}
2631 
// SUB m32, imm32 (81 /5): subtract an immediate from memory.
void Assembler::subl(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefix(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);  // rbp encodes ModRM reg field /5
}
2637 
// SUB m32, r32 (29 /r): subtract src from a memory operand.
void Assembler::subl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}
2644 
// SUB r32, imm32 (81 /5): subtract an immediate from dst.
void Assembler::subl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xE8, dst, imm32);  // 0xE8 = ModRM mod=11, reg=/5
}
2649 
2650 // Force generation of a 4 byte immediate value even if it fits into 8bit
// SUB r32, imm32 with a forced 4-byte immediate; used when the emitted
// instruction must have a fixed length (e.g. for later patching).
void Assembler::subl_imm32(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}
2655 
// SUB r32, m32 (2B /r): subtract a memory operand from dst.
void Assembler::subl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}
2662 
// SUB r32, r32 (2B /r): subtract src from dst.
void Assembler::subl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
2667 
// SUBSD xmm, xmm (F2 0F 5C): scalar double subtract, dst -= src.
void Assembler::subsd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
}
2672 
// SUBSD xmm, m64 (F2 0F 5C): scalar double subtract, dst -= [src].
void Assembler::subsd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F2);
}
2677 
// SUBSS xmm, xmm (F3 0F 5C): scalar float subtract, dst -= src.
void Assembler::subss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
}
2682 
// SUBSS xmm, m32 (F3 0F 5C): scalar float subtract, dst -= [src].
void Assembler::subss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_F3);
}
2687 
// TEST r8, imm8 (F6 /0): AND dst's low byte with imm8, set flags, discard result.
void Assembler::testb(Register dst, int imm8) {
  // On 32-bit only AL..BH are byte-addressable; 64-bit uses a REX prefix instead.
  NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
  (void) prefix_and_encode(dst->encoding(), true);
  emit_arith_b(0xF6, 0xC0, dst, imm8);
}
2693 
// TEST r32, imm32: AND dst with imm32, set flags, discard result.
// Uses the one-byte-shorter EAX-specific form (A9 id) when dst is rax/eax,
// otherwise the generic F7 /0 form.
void Assembler::testl(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    // encoding 0 is rax/eax: no ModRM byte needed
    emit_byte(0xA9);
  } else {
    encode = prefix_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}
2708 
// TEST r32, r32 (85 /r): AND dst with src, set flags, discard result.
void Assembler::testl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}
2713 
// TEST r32, m32 (85 /r): AND dst with a memory operand, set flags only.
void Assembler::testl(Register dst, Address  src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x85);
  emit_operand(dst, src);
}
2720 
// UCOMISD xmm, m64 (66 0F 2E): unordered scalar double compare, sets EFLAGS.
void Assembler::ucomisd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
}
2725 
// UCOMISD xmm, xmm (66 0F 2E): unordered scalar double compare, sets EFLAGS.
void Assembler::ucomisd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
}
2730 
// UCOMISS xmm, m32 (0F 2E): unordered scalar float compare, sets EFLAGS.
void Assembler::ucomiss(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
}
2735 
// UCOMISS xmm, xmm (0F 2E): unordered scalar float compare, sets EFLAGS.
void Assembler::ucomiss(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_NONE);
}
2740 
2741 
// XADD m32, r32 (0F C1 /r): exchange and add; usually combined with LOCK by callers.
void Assembler::xaddl(Address dst, Register src) {
  InstructionMark im(this);
  prefix(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}
2749 
// XCHG r32, m32 (87 /r): swap dst with a memory operand (implicitly locked).
void Assembler::xchgl(Register dst, Address src) { // xchg
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}
2756 
// XCHG r32, r32 (87 /r): swap the two registers.
void Assembler::xchgl(Register dst, Register src) {
  int encode = prefix_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xc0 | encode);
}
2762 
// XOR r32, imm32 (81 /6): exclusive-or an immediate into dst.
void Assembler::xorl(Register dst, int32_t imm32) {
  prefix(dst);
  emit_arith(0x81, 0xF0, dst, imm32);  // 0xF0 = ModRM mod=11, reg=/6
}
2767 
// XOR r32, m32 (33 /r): exclusive-or a memory operand into dst.
void Assembler::xorl(Register dst, Address src) {
  InstructionMark im(this);
  prefix(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}
2774 
// XOR r32, r32 (33 /r): exclusive-or src into dst.
void Assembler::xorl(Register dst, Register src) {
  (void) prefix_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}
2779 
2780 
2781 // AVX 3-operands scalar float-point arithmetic instructions
2782 
// VADDSD xmm, xmm, m64 (VEX.F2 0F 58): scalar double add, dst = nds + [src].
void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}
2787 
// VADDSD xmm, xmm, xmm (VEX.F2 0F 58): scalar double add, dst = nds + src.
void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}
2792 
// VADDSS xmm, xmm, m32 (VEX.F3 0F 58): scalar float add, dst = nds + [src].
void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}
2797 
// VADDSS xmm, xmm, xmm (VEX.F3 0F 58): scalar float add, dst = nds + src.
void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}
2802 
// VDIVSD xmm, xmm, m64 (VEX.F2 0F 5E): scalar double divide, dst = nds / [src].
void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}
2807 
// VDIVSD xmm, xmm, xmm (VEX.F2 0F 5E): scalar double divide, dst = nds / src.
void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}
2812 
// VDIVSS xmm, xmm, m32 (VEX.F3 0F 5E): scalar float divide, dst = nds / [src].
void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}
2817 
// VDIVSS xmm, xmm, xmm (VEX.F3 0F 5E): scalar float divide, dst = nds / src.
void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}
2822 
// VMULSD xmm, xmm, m64 (VEX.F2 0F 59): scalar double multiply, dst = nds * [src].
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}
2827 
// VMULSD xmm, xmm, xmm (VEX.F2 0F 59): scalar double multiply, dst = nds * src.
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
}
2832 
// VMULSS xmm, xmm, m32 (VEX.F3 0F 59): scalar float multiply, dst = nds * [src].
void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
}
2837 
2838 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2839   assert(VM_Version::supports_avx(), "");
2840   emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2841 }
2842 
2843 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
2844   assert(VM_Version::supports_avx(), "");
2845   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2846 }
2847 
2848 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2849   assert(VM_Version::supports_avx(), "");
2850   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F2, /* vector256 */ false);
2851 }
2852 
2853 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
2854   assert(VM_Version::supports_avx(), "");
2855   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2856 }
2857 
2858 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
2859   assert(VM_Version::supports_avx(), "");
2860   emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_F3, /* vector256 */ false);
2861 }
2862 
2863 //====================VECTOR ARITHMETIC=====================================
2864 
2865 // Float-point vector arithmetic
2866 
// Packed FP add (opcode 0x58): 66 prefix = packed double (pd),
// no prefix = packed single (ps). SSE forms are destructive
// (dst += src); AVX forms are 3-operand (dst = nds + src) and select
// 128- vs 256-bit encoding via vector256.
void Assembler::addpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_66);
}

void Assembler::addps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x58, dst, src, VEX_SIMD_NONE);
}

void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x58, dst, nds, src, VEX_SIMD_NONE, vector256);
}
2896 
// Packed FP subtract (opcode 0x5C): 66 prefix = pd, no prefix = ps.
// SSE forms: dst -= src. AVX forms: dst = nds - src.
void Assembler::subpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_66);
}

void Assembler::subps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5C, dst, src, VEX_SIMD_NONE);
}

void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5C, dst, nds, src, VEX_SIMD_NONE, vector256);
}
2926 
// Packed FP multiply (opcode 0x59): 66 prefix = pd, no prefix = ps.
// SSE forms: dst *= src. AVX forms: dst = nds * src.
void Assembler::mulpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_66);
}

void Assembler::mulps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x59, dst, src, VEX_SIMD_NONE);
}

void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x59, dst, nds, src, VEX_SIMD_NONE, vector256);
}
2956 
// Packed FP divide (opcode 0x5E): 66 prefix = pd, no prefix = ps.
// SSE forms: dst /= src. AVX forms: dst = nds / src.
void Assembler::divpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_66);
}

void Assembler::divps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x5E, dst, src, VEX_SIMD_NONE);
}

void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x5E, dst, nds, src, VEX_SIMD_NONE, vector256);
}
2986 
// Packed bitwise AND (opcode 0x54): 66 prefix = pd, no prefix = ps.
// Note: the *ps forms only need SSE; *pd forms need SSE2 (the checks
// are NOT_LP64 because all x86_64 CPUs have SSE2).
void Assembler::andpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
}

void Assembler::andps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
}

void Assembler::andps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_NONE);
}

void Assembler::andpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x54, dst, src, VEX_SIMD_66);
}

void Assembler::vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x54, dst, nds, src, VEX_SIMD_NONE, vector256);
}
3026 
// Packed bitwise XOR (opcode 0x57): 66 prefix = pd, no prefix = ps.
// Commonly used as xorps(reg, reg) to zero a register.
void Assembler::xorpd(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
}

void Assembler::xorps(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
}

void Assembler::xorpd(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_66);
}

void Assembler::xorps(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  emit_simd_arith(0x57, dst, src, VEX_SIMD_NONE);
}

void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
}

void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_66, vector256);
}

void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
  assert(VM_Version::supports_avx(), "");
  emit_vex_arith(0x57, dst, nds, src, VEX_SIMD_NONE, vector256);
}
3066 
3067 
3068 // Integer vector arithmetic
3069 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
3070   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3071   emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
3072 }
3073 
3074 void Assembler::paddw(XMMRegister dst, XMMRegister src) {
3075   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3076   emit_simd_arith(0xFD, dst, src, VEX_SIMD_66);
3077 }
3078 
3079 void Assembler::paddd(XMMRegister dst, XMMRegister src) {
3080   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3081   emit_simd_arith(0xFE, dst, src, VEX_SIMD_66);
3082 }
3083 
3084 void Assembler::paddq(XMMRegister dst, XMMRegister src) {
3085   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3086   emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
3087 }
3088 
3089 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3090   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3091   emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
3092 }
3093 
3094 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3095   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3096   emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
3097 }
3098 
3099 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3100   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3101   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
3102 }
3103 
3104 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3105   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3106   emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
3107 }
3108 
3109 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3110   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3111   emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
3112 }
3113 
3114 void Assembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3115   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3116   emit_vex_arith(0xFD, dst, nds, src, VEX_SIMD_66, vector256);
3117 }
3118 
3119 void Assembler::vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3120   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3121   emit_vex_arith(0xFE, dst, nds, src, VEX_SIMD_66, vector256);
3122 }
3123 
3124 void Assembler::vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3125   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3126   emit_vex_arith(0xD4, dst, nds, src, VEX_SIMD_66, vector256);
3127 }
3128 
3129 void Assembler::psubb(XMMRegister dst, XMMRegister src) {
3130   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3131   emit_simd_arith(0xF8, dst, src, VEX_SIMD_66);
3132 }
3133 
3134 void Assembler::psubw(XMMRegister dst, XMMRegister src) {
3135   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3136   emit_simd_arith(0xF9, dst, src, VEX_SIMD_66);
3137 }
3138 
3139 void Assembler::psubd(XMMRegister dst, XMMRegister src) {
3140   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3141   emit_simd_arith(0xFA, dst, src, VEX_SIMD_66);
3142 }
3143 
3144 void Assembler::psubq(XMMRegister dst, XMMRegister src) {
3145   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3146   emit_simd_arith(0xFB, dst, src, VEX_SIMD_66);
3147 }
3148 
3149 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3150   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3151   emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
3152 }
3153 
3154 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3155   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3156   emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
3157 }
3158 
3159 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3160   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3161   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
3162 }
3163 
3164 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3165   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3166   emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
3167 }
3168 
3169 void Assembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3170   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3171   emit_vex_arith(0xF8, dst, nds, src, VEX_SIMD_66, vector256);
3172 }
3173 
3174 void Assembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3175   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3176   emit_vex_arith(0xF9, dst, nds, src, VEX_SIMD_66, vector256);
3177 }
3178 
3179 void Assembler::vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3180   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3181   emit_vex_arith(0xFA, dst, nds, src, VEX_SIMD_66, vector256);
3182 }
3183 
3184 void Assembler::vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3185   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3186   emit_vex_arith(0xFB, dst, nds, src, VEX_SIMD_66, vector256);
3187 }
3188 
3189 void Assembler::pmullw(XMMRegister dst, XMMRegister src) {
3190   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3191   emit_simd_arith(0xD5, dst, src, VEX_SIMD_66);
3192 }
3193 
3194 void Assembler::pmulld(XMMRegister dst, XMMRegister src) {
3195   assert(VM_Version::supports_sse4_1(), "");
3196   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
3197   emit_byte(0x40);
3198   emit_byte(0xC0 | encode);
3199 }
3200 
3201 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3202   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3203   emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
3204 }
3205 
3206 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3207   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3208   int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
3209   emit_byte(0x40);
3210   emit_byte(0xC0 | encode);
3211 }
3212 
3213 void Assembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3214   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3215   emit_vex_arith(0xD5, dst, nds, src, VEX_SIMD_66, vector256);
3216 }
3217 
3218 void Assembler::vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3219   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3220   InstructionMark im(this);
3221   int dst_enc = dst->encoding();
3222   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
3223   vex_prefix(src, nds_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
3224   emit_byte(0x40);
3225   emit_operand(dst, src);
3226 }
3227 
3228 // Shift packed integers left by specified number of bits.
3229 void Assembler::psllw(XMMRegister dst, int shift) {
3230   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3231   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
3232   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
3233   emit_byte(0x71);
3234   emit_byte(0xC0 | encode);
3235   emit_byte(shift);
3236 }
3237 
3238 void Assembler::pslld(XMMRegister dst, int shift) {
3239   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3240   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
3241   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
3242   emit_byte(0x72);
3243   emit_byte(0xC0 | encode);
3244   emit_byte(shift);
3245 }
3246 
3247 void Assembler::psllq(XMMRegister dst, int shift) {
3248   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3249   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
3250   int encode = simd_prefix_and_encode(xmm6, dst, dst, VEX_SIMD_66);
3251   emit_byte(0x73);
3252   emit_byte(0xC0 | encode);
3253   emit_byte(shift);
3254 }
3255 
3256 void Assembler::psllw(XMMRegister dst, XMMRegister shift) {
3257   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3258   emit_simd_arith(0xF1, dst, shift, VEX_SIMD_66);
3259 }
3260 
3261 void Assembler::pslld(XMMRegister dst, XMMRegister shift) {
3262   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3263   emit_simd_arith(0xF2, dst, shift, VEX_SIMD_66);
3264 }
3265 
3266 void Assembler::psllq(XMMRegister dst, XMMRegister shift) {
3267   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3268   emit_simd_arith(0xF3, dst, shift, VEX_SIMD_66);
3269 }
3270 
3271 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3272   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3273   // XMM6 is for /6 encoding: 66 0F 71 /6 ib
3274   emit_vex_arith(0x71, xmm6, dst, src, VEX_SIMD_66, vector256);
3275   emit_byte(shift);
3276 }
3277 
3278 void Assembler::vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3279   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3280   // XMM6 is for /6 encoding: 66 0F 72 /6 ib
3281   emit_vex_arith(0x72, xmm6, dst, src, VEX_SIMD_66, vector256);
3282   emit_byte(shift);
3283 }
3284 
3285 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3286   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3287   // XMM6 is for /6 encoding: 66 0F 73 /6 ib
3288   emit_vex_arith(0x73, xmm6, dst, src, VEX_SIMD_66, vector256);
3289   emit_byte(shift);
3290 }
3291 
3292 void Assembler::vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3293   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3294   emit_vex_arith(0xF1, dst, src, shift, VEX_SIMD_66, vector256);
3295 }
3296 
3297 void Assembler::vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3298   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3299   emit_vex_arith(0xF2, dst, src, shift, VEX_SIMD_66, vector256);
3300 }
3301 
3302 void Assembler::vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3303   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3304   emit_vex_arith(0xF3, dst, src, shift, VEX_SIMD_66, vector256);
3305 }
3306 
3307 // Shift packed integers logically right by specified number of bits.
3308 void Assembler::psrlw(XMMRegister dst, int shift) {
3309   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3310   // XMM2 is for /2 encoding: 66 0F 71 /2 ib
3311   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
3312   emit_byte(0x71);
3313   emit_byte(0xC0 | encode);
3314   emit_byte(shift);
3315 }
3316 
3317 void Assembler::psrld(XMMRegister dst, int shift) {
3318   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3319   // XMM2 is for /2 encoding: 66 0F 72 /2 ib
3320   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
3321   emit_byte(0x72);
3322   emit_byte(0xC0 | encode);
3323   emit_byte(shift);
3324 }
3325 
3326 void Assembler::psrlq(XMMRegister dst, int shift) {
3327   // Do not confuse it with psrldq SSE2 instruction which
3328   // shifts 128 bit value in xmm register by number of bytes.
3329   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3330   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3331   int encode = simd_prefix_and_encode(xmm2, dst, dst, VEX_SIMD_66);
3332   emit_byte(0x73);
3333   emit_byte(0xC0 | encode);
3334   emit_byte(shift);
3335 }
3336 
3337 void Assembler::psrlw(XMMRegister dst, XMMRegister shift) {
3338   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3339   emit_simd_arith(0xD1, dst, shift, VEX_SIMD_66);
3340 }
3341 
3342 void Assembler::psrld(XMMRegister dst, XMMRegister shift) {
3343   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3344   emit_simd_arith(0xD2, dst, shift, VEX_SIMD_66);
3345 }
3346 
3347 void Assembler::psrlq(XMMRegister dst, XMMRegister shift) {
3348   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3349   emit_simd_arith(0xD3, dst, shift, VEX_SIMD_66);
3350 }
3351 
3352 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3353   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3354   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3355   emit_vex_arith(0x71, xmm2, dst, src, VEX_SIMD_66, vector256);
3356   emit_byte(shift);
3357 }
3358 
3359 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3360   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3361   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3362   emit_vex_arith(0x72, xmm2, dst, src, VEX_SIMD_66, vector256);
3363   emit_byte(shift);
3364 }
3365 
3366 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3367   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3368   // XMM2 is for /2 encoding: 66 0F 73 /2 ib
3369   emit_vex_arith(0x73, xmm2, dst, src, VEX_SIMD_66, vector256);
3370   emit_byte(shift);
3371 }
3372 
3373 void Assembler::vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3374   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3375   emit_vex_arith(0xD1, dst, src, shift, VEX_SIMD_66, vector256);
3376 }
3377 
3378 void Assembler::vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3379   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3380   emit_vex_arith(0xD2, dst, src, shift, VEX_SIMD_66, vector256);
3381 }
3382 
3383 void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3384   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3385   emit_vex_arith(0xD3, dst, src, shift, VEX_SIMD_66, vector256);
3386 }
3387 
3388 // Shift packed integers arithmetically right by specified number of bits.
3389 void Assembler::psraw(XMMRegister dst, int shift) {
3390   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3391   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
3392   int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
3393   emit_byte(0x71);
3394   emit_byte(0xC0 | encode);
3395   emit_byte(shift);
3396 }
3397 
3398 void Assembler::psrad(XMMRegister dst, int shift) {
3399   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3400   // XMM4 is for /4 encoding: 66 0F 72 /4 ib
3401   int encode = simd_prefix_and_encode(xmm4, dst, dst, VEX_SIMD_66);
3402   emit_byte(0x72);
3403   emit_byte(0xC0 | encode);
3404   emit_byte(shift);
3405 }
3406 
3407 void Assembler::psraw(XMMRegister dst, XMMRegister shift) {
3408   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3409   emit_simd_arith(0xE1, dst, shift, VEX_SIMD_66);
3410 }
3411 
3412 void Assembler::psrad(XMMRegister dst, XMMRegister shift) {
3413   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3414   emit_simd_arith(0xE2, dst, shift, VEX_SIMD_66);
3415 }
3416 
3417 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3418   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3419   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
3420   emit_vex_arith(0x71, xmm4, dst, src, VEX_SIMD_66, vector256);
3421   emit_byte(shift);
3422 }
3423 
3424 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256) {
3425   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3426   // XMM4 is for /4 encoding: 66 0F 71 /4 ib
3427   emit_vex_arith(0x72, xmm4, dst, src, VEX_SIMD_66, vector256);
3428   emit_byte(shift);
3429 }
3430 
3431 void Assembler::vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3432   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3433   emit_vex_arith(0xE1, dst, src, shift, VEX_SIMD_66, vector256);
3434 }
3435 
3436 void Assembler::vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256) {
3437   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3438   emit_vex_arith(0xE2, dst, src, shift, VEX_SIMD_66, vector256);
3439 }
3440 
3441 
3442 // AND packed integers
3443 void Assembler::pand(XMMRegister dst, XMMRegister src) {
3444   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3445   emit_simd_arith(0xDB, dst, src, VEX_SIMD_66);
3446 }
3447 
3448 void Assembler::vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3449   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3450   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
3451 }
3452 
3453 void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3454   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3455   emit_vex_arith(0xDB, dst, nds, src, VEX_SIMD_66, vector256);
3456 }
3457 
3458 void Assembler::por(XMMRegister dst, XMMRegister src) {
3459   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3460   emit_simd_arith(0xEB, dst, src, VEX_SIMD_66);
3461 }
3462 
3463 void Assembler::vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3464   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3465   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
3466 }
3467 
3468 void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3469   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3470   emit_vex_arith(0xEB, dst, nds, src, VEX_SIMD_66, vector256);
3471 }
3472 
3473 void Assembler::pxor(XMMRegister dst, XMMRegister src) {
3474   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
3475   emit_simd_arith(0xEF, dst, src, VEX_SIMD_66);
3476 }
3477 
3478 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
3479   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3480   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
3481 }
3482 
3483 void Assembler::vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256) {
3484   assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
3485   emit_vex_arith(0xEF, dst, nds, src, VEX_SIMD_66, vector256);
3486 }
3487 
3488 
// Insert the low 128 bits of src into the UPPER half of dst; the lower
// half of dst comes from nds. vinsertf128 (0F 3A 18, AVX) is the FP
// form, vinserti128 (0F 3A 38, AVX2) the integer form; both use a
// VEX.256 encoding and the immediate selects the destination half.
void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
  emit_byte(0x18);
  emit_byte(0xC0 | encode);
  // 0x00 - insert into lower 128 bits
  // 0x01 - insert into upper 128 bits
  emit_byte(0x01);
}

void Assembler::vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  assert(VM_Version::supports_avx2(), "");
  bool vector256 = true;
  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
  emit_byte(0x38);
  emit_byte(0xC0 | encode);
  // 0x00 - insert into lower 128 bits
  // 0x01 - insert into upper 128 bits
  emit_byte(0x01);
}
3510 
// VZEROUPPER (VEX.NONE 0F 77): zeroes the upper halves of all YMM
// registers; the VEX prefix is emitted for its encoding only, so the
// register operands (xmm0) are dummies and the encode result is unused.
void Assembler::vzeroupper() {
  assert(VM_Version::supports_avx(), "");
  (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);
  emit_byte(0x77);
}
3516 
3517 
#ifndef _LP64
// 32bit only pieces of the assembler

// cmp reg, imm32 with relocated immediate (0x81 /7 id; 0xF8 | reg is
// the mod=11, reg=/7 ModRM byte).
void Assembler::cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT
  InstructionMark im(this);
  emit_byte(0x81);
  emit_byte(0xF8 | src1->encoding());
  emit_data(imm32, rspec, 0);
}

// cmp [mem], imm32 with relocated immediate (0x81 /7 id; rdi encodes /7).
void Assembler::cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  // NO PREFIX AS NEVER 64BIT (not even 32bit versions of 64bit regs)
  InstructionMark im(this);
  emit_byte(0x81);
  emit_operand(rdi, src1);
  emit_data(imm32, rspec, 0);
}

// The 64-bit (32bit platform) cmpxchg compares the value at adr with the contents of rdx:rax,
// and stores rcx:rbx into adr if so; otherwise, the value at adr is loaded
// into rdx:rax.  The ZF is set if the compared values were equal, and cleared otherwise.
void Assembler::cmpxchg8(Address adr) {
  // CMPXCHG8B: 0F C7 /1 (rcx encodes the /1 opcode-extension digit).
  InstructionMark im(this);
  emit_byte(0x0F);
  emit_byte(0xc7);
  emit_operand(rcx, adr);
}

// One-byte DEC r32 (0x48 + reg). 32-bit only: in 64-bit mode these
// byte values are REX prefixes.
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
 emit_byte(0x48 | dst->encoding());
}

#endif // _LP64
3553 
3554 // 64bit typically doesn't use the x87 but needs to for the trig funcs
3555 
// x87 emitters. Register-stack forms go through emit_farith(opcode, base, i)
// where i selects ST(i); memory forms use emit_operand32, with the Register
// argument supplying the ModRM /digit opcode extension.

void Assembler::fabs() {
  emit_byte(0xD9);  // FABS: ST(0) = |ST(0)|
  emit_byte(0xE1);
}

void Assembler::fadd(int i) {
  emit_farith(0xD8, 0xC0, i);  // FADD ST(0), ST(i)
}

void Assembler::fadd_d(Address src) {
  // FADD m64fp (DC /0)
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rax, src);
}

void Assembler::fadd_s(Address src) {
  // FADD m32fp (D8 /0)
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rax, src);
}

void Assembler::fadda(int i) {
  emit_farith(0xDC, 0xC0, i);  // FADD ST(i), ST(0)
}

void Assembler::faddp(int i) {
  emit_farith(0xDE, 0xC0, i);  // FADDP ST(i), ST(0) and pop
}

void Assembler::fchs() {
  emit_byte(0xD9);  // FCHS: negate ST(0)
  emit_byte(0xE0);
}
3589 
void Assembler::fcom(int i) {
  emit_farith(0xD8, 0xD0, i);  // FCOM ST(i): compare ST(0) with ST(i)
}

void Assembler::fcomp(int i) {
  emit_farith(0xD8, 0xD8, i);  // FCOMP ST(i): compare and pop
}

void Assembler::fcomp_d(Address src) {
  // FCOMP m64fp (DC /3)
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbx, src);
}

void Assembler::fcomp_s(Address src) {
  // FCOMP m32fp (D8 /3)
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbx, src);
}

void Assembler::fcompp() {
  emit_byte(0xDE);  // FCOMPP: compare ST(0) with ST(1), pop both
  emit_byte(0xD9);
}

void Assembler::fcos() {
  emit_byte(0xD9);  // FCOS: ST(0) = cos(ST(0))
  emit_byte(0xFF);
}

void Assembler::fdecstp() {
  emit_byte(0xD9);  // FDECSTP: decrement the x87 top-of-stack pointer
  emit_byte(0xF6);
}
3624 
void Assembler::fdiv(int i) {
  emit_farith(0xD8, 0xF0, i);  // FDIV ST(0), ST(i)
}

void Assembler::fdiv_d(Address src) {
  // FDIV m64fp (DC /6)
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsi, src);
}

void Assembler::fdiv_s(Address src) {
  // FDIV m32fp (D8 /6)
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsi, src);
}

void Assembler::fdiva(int i) {
  emit_farith(0xDC, 0xF8, i);  // FDIV ST(i), ST(0)
}

// Note: The Intel manual (Pentium Processor User's Manual, Vol.3, 1994)
//       is erroneous for some of the floating-point instructions below.

void Assembler::fdivp(int i) {
  emit_farith(0xDE, 0xF8, i);                    // ST(0) <- ST(0) / ST(1) and pop (Intel manual wrong)
}

void Assembler::fdivr(int i) {
  emit_farith(0xD8, 0xF8, i);  // FDIVR ST(0), ST(i): reversed divide
}

void Assembler::fdivr_d(Address src) {
  // FDIVR m64fp (DC /7)
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rdi, src);
}

void Assembler::fdivr_s(Address src) {
  // FDIVR m32fp (D8 /7)
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rdi, src);
}

void Assembler::fdivra(int i) {
  emit_farith(0xDC, 0xF0, i);  // FDIVR ST(i), ST(0)
}

void Assembler::fdivrp(int i) {
  emit_farith(0xDE, 0xF0, i);                    // ST(0) <- ST(1) / ST(0) and pop (Intel manual wrong)
}
3675 
void Assembler::ffree(int i) {
  emit_farith(0xDD, 0xC0, i);  // FFREE ST(i): mark register as empty
}

void Assembler::fild_d(Address adr) {
  // FILD m64int (DF /5): push 64-bit integer as float
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rbp, adr);
}

void Assembler::fild_s(Address adr) {
  // FILD m32int (DB /0)
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rax, adr);
}

void Assembler::fincstp() {
  emit_byte(0xD9);  // FINCSTP: increment the x87 top-of-stack pointer
  emit_byte(0xF7);
}

void Assembler::finit() {
  emit_byte(0x9B);  // FWAIT prefix: this is FINIT, the wait form of FNINIT
  emit_byte(0xDB);
  emit_byte(0xE3);
}

void Assembler::fist_s(Address adr) {
  // FIST m32int (DB /2): store ST(0) as 32-bit integer
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdx, adr);
}

void Assembler::fistp_d(Address adr) {
  // FISTP m64int (DF /7): store as 64-bit integer and pop
  InstructionMark im(this);
  emit_byte(0xDF);
  emit_operand32(rdi, adr);
}

void Assembler::fistp_s(Address adr) {
  // FISTP m32int (DB /3): store as 32-bit integer and pop
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbx, adr);
}

void Assembler::fld1() {
  emit_byte(0xD9);  // FLD1: push +1.0
  emit_byte(0xE8);
}
3725 
void Assembler::fld_d(Address adr) {
  // FLD m64fp (DD /0)
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rax, adr);
}

void Assembler::fld_s(Address adr) {
  // FLD m32fp (D9 /0)
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rax, adr);
}


void Assembler::fld_s(int index) {
  emit_farith(0xD9, 0xC0, index);  // FLD ST(index): push a copy of ST(index)
}

void Assembler::fld_x(Address adr) {
  // FLD m80fp (DB /5): load 80-bit extended precision
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rbp, adr);
}
3748 
3749 void Assembler::fldcw(Address src) {
3750   InstructionMark im(this);
3751   emit_byte(0xd9);
3752   emit_operand32(rbp, src);
3753 }
3754 
void Assembler::fldenv(Address src) {
  // FLDENV m (D9 /4): load the x87 environment
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rsp, src);
}

void Assembler::fldlg2() {
  emit_byte(0xD9);  // FLDLG2: push log10(2)
  emit_byte(0xEC);
}

void Assembler::fldln2() {
  emit_byte(0xD9);  // FLDLN2: push ln(2)
  emit_byte(0xED);
}

void Assembler::fldz() {
  emit_byte(0xD9);  // FLDZ: push +0.0
  emit_byte(0xEE);
}

// ln(x) pseudo-op: ln(2) * log2(x) via FYL2X; expects x in ST(0).
void Assembler::flog() {
  fldln2();
  fxch();
  fyl2x();
}

// log10(x) pseudo-op: log10(2) * log2(x) via FYL2X; expects x in ST(0).
void Assembler::flog10() {
  fldlg2();
  fxch();
  fyl2x();
}
3787 
void Assembler::fmul(int i) {
  emit_farith(0xD8, 0xC8, i);  // FMUL ST(0), ST(i)
}

void Assembler::fmul_d(Address src) {
  // FMUL m64fp (DC /1)
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rcx, src);
}

void Assembler::fmul_s(Address src) {
  // FMUL m32fp (D8 /1)
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rcx, src);
}

void Assembler::fmula(int i) {
  emit_farith(0xDC, 0xC8, i);  // FMUL ST(i), ST(0)
}

void Assembler::fmulp(int i) {
  emit_farith(0xDE, 0xC8, i);  // FMULP ST(i), ST(0) and pop
}

void Assembler::fnsave(Address dst) {
  // FNSAVE m (DD /6): store FPU state without checking pending exceptions
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsi, dst);
}

void Assembler::fnstcw(Address src) {
  // Emits 9B (FWAIT) + D9 /7, i.e. the wait form FSTCW despite the name.
  InstructionMark im(this);
  emit_byte(0x9B);
  emit_byte(0xD9);
  emit_operand32(rdi, src);
}
3824 
3825 void Assembler::fnstsw_ax() {
3826   emit_byte(0xdF);
3827   emit_byte(0xE0);
3828 }
3829 
void Assembler::fprem() {
  emit_byte(0xD9);  // FPREM: partial remainder (truncating)
  emit_byte(0xF8);
}

void Assembler::fprem1() {
  emit_byte(0xD9);  // FPREM1: IEEE-754 partial remainder (round-to-nearest)
  emit_byte(0xF5);
}

void Assembler::frstor(Address src) {
  // FRSTOR m (DD /4): restore FPU state saved by fnsave
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rsp, src);
}

void Assembler::fsin() {
  emit_byte(0xD9);  // FSIN: ST(0) = sin(ST(0))
  emit_byte(0xFE);
}

void Assembler::fsqrt() {
  emit_byte(0xD9);  // FSQRT: ST(0) = sqrt(ST(0))
  emit_byte(0xFA);
}

void Assembler::fst_d(Address adr) {
  // FST m64fp (DD /2): store ST(0) without popping
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rdx, adr);
}

void Assembler::fst_s(Address adr) {
  // FST m32fp (D9 /2)
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rdx, adr);
}

void Assembler::fstp_d(Address adr) {
  // FSTP m64fp (DD /3): store ST(0) and pop
  InstructionMark im(this);
  emit_byte(0xDD);
  emit_operand32(rbx, adr);
}

void Assembler::fstp_d(int index) {
  emit_farith(0xDD, 0xD8, index);  // FSTP ST(index)
}

void Assembler::fstp_s(Address adr) {
  // FSTP m32fp (D9 /3)
  InstructionMark im(this);
  emit_byte(0xD9);
  emit_operand32(rbx, adr);
}

void Assembler::fstp_x(Address adr) {
  // FSTP m80fp (DB /7): store 80-bit extended precision and pop
  InstructionMark im(this);
  emit_byte(0xDB);
  emit_operand32(rdi, adr);
}

void Assembler::fsub(int i) {
  emit_farith(0xD8, 0xE0, i);  // FSUB ST(0), ST(i)
}

void Assembler::fsub_d(Address src) {
  // FSUB m64fp (DC /4)
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rsp, src);
}

void Assembler::fsub_s(Address src) {
  // FSUB m32fp (D8 /4)
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rsp, src);
}
3904 }
3905 
void Assembler::fsuba(int i) {
  emit_farith(0xDC, 0xE8, i);  // FSUB ST(i), ST(0)
}

void Assembler::fsubp(int i) {
  emit_farith(0xDE, 0xE8, i);                    // ST(0) <- ST(0) - ST(1) and pop (Intel manual wrong)
}

void Assembler::fsubr(int i) {
  emit_farith(0xD8, 0xE8, i);  // FSUBR ST(0), ST(i): reversed subtract
}

void Assembler::fsubr_d(Address src) {
  // FSUBR m64fp (DC /5)
  InstructionMark im(this);
  emit_byte(0xDC);
  emit_operand32(rbp, src);
}

void Assembler::fsubr_s(Address src) {
  // FSUBR m32fp (D8 /5)
  InstructionMark im(this);
  emit_byte(0xD8);
  emit_operand32(rbp, src);
}

void Assembler::fsubra(int i) {
  emit_farith(0xDC, 0xE0, i);  // FSUBR ST(i), ST(0)
}

void Assembler::fsubrp(int i) {
  emit_farith(0xDE, 0xE0, i);                    // ST(0) <- ST(1) - ST(0) and pop (Intel manual wrong)
}
3937 
// tan pseudo-op: FPTAN pushes 1.0 after computing tan(ST(0)), so follow
// with FSTP ST(0) to discard it.
void Assembler::ftan() {
  emit_byte(0xD9);  // FPTAN
  emit_byte(0xF2);
  emit_byte(0xDD);  // FSTP ST(0): pop the 1.0 pushed by FPTAN
  emit_byte(0xD8);
}

void Assembler::ftst() {
  emit_byte(0xD9);  // FTST: compare ST(0) with 0.0
  emit_byte(0xE4);
}

void Assembler::fucomi(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDB, 0xE8, i);  // FUCOMI ST, ST(i): unordered compare, sets EFLAGS
}

void Assembler::fucomip(int i) {
  // make sure the instruction is supported (introduced for P6, together with cmov)
  guarantee(VM_Version::supports_cmov(), "illegal instruction");
  emit_farith(0xDF, 0xE8, i);  // FUCOMIP: as fucomi, then pop
}

void Assembler::fwait() {
  emit_byte(0x9B);  // FWAIT: check for pending unmasked FP exceptions
}

void Assembler::fxch(int i) {
  emit_farith(0xD9, 0xC8, i);  // FXCH ST(i): swap ST(0) and ST(i)
}

void Assembler::fyl2x() {
  emit_byte(0xD9);  // FYL2X: ST(1) = ST(1) * log2(ST(0)), pop
  emit_byte(0xF1);
}

void Assembler::frndint() {
  emit_byte(0xD9);  // FRNDINT: round ST(0) to integer per control word
  emit_byte(0xFC);
}

void Assembler::f2xm1() {
  emit_byte(0xD9);  // F2XM1: ST(0) = 2^ST(0) - 1
  emit_byte(0xF0);
}

void Assembler::fldl2e() {
  emit_byte(0xD9);  // FLDL2E: push log2(e)
  emit_byte(0xEA);
}
3989 
// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
static int simd_opc[4] = { 0,    0, 0x38, 0x3A };

// Generate SSE legacy REX prefix and SIMD opcode based on VEX encoding.
// Emits: [SIMD prefix] [REX/REX.W] 0F [38|3A] so SSE code paths can share
// the VEX-style pre/opc parameterization.
void Assembler::rex_prefix(Address adr, XMMRegister xreg, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);
  }
  if (rex_w) {
    prefixq(adr, xreg);   // 64-bit operand size: REX.W form
  } else {
    prefix(adr, xreg);
  }
  if (opc > 0) {
    emit_byte(0x0F);      // escape byte common to all mapped opcodes
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);    // second escape for the 0F 38 / 0F 3A maps
    }
  }
}
4013 
// Register-register variant of rex_prefix: emits [SIMD prefix] [REX]
// 0F [38|3A] and returns the low-3-bit reg/rm encoding for the ModRM byte.
int Assembler::rex_prefix_and_encode(int dst_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool rex_w) {
  if (pre > 0) {
    emit_byte(simd_pre[pre]);
  }
  int encode = (rex_w) ? prefixq_and_encode(dst_enc, src_enc) :
                          prefix_and_encode(dst_enc, src_enc);
  if (opc > 0) {
    emit_byte(0x0F);
    int opc2 = simd_opc[opc];
    if (opc2 > 0) {
      emit_byte(opc2);
    }
  }
  return encode;
}
4029 
4030 
4031 void Assembler::vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, int nds_enc, VexSimdPrefix pre, VexOpcode opc, bool vector256) {
4032   if (vex_b || vex_x || vex_w || (opc == VEX_OPCODE_0F_38) || (opc == VEX_OPCODE_0F_3A)) {
4033     prefix(VEX_3bytes);
4034 
4035     int byte1 = (vex_r ? VEX_R : 0) | (vex_x ? VEX_X : 0) | (vex_b ? VEX_B : 0);
4036     byte1 = (~byte1) & 0xE0;
4037     byte1 |= opc;
4038     a_byte(byte1);
4039 
4040     int byte2 = ((~nds_enc) & 0xf) << 3;
4041     byte2 |= (vex_w ? VEX_W : 0) | (vector256 ? 4 : 0) | pre;
4042     emit_byte(byte2);
4043   } else {
4044     prefix(VEX_2bytes);
4045 
4046     int byte1 = vex_r ? VEX_R : 0;
4047     byte1 = (~byte1) & 0x80;
4048     byte1 |= ((~nds_enc) & 0xf) << 3;
4049     byte1 |= (vector256 ? 4 : 0) | pre;
4050     emit_byte(byte1);
4051   }
4052 }
4053 
// VEX prefix for a memory operand: R extends the XMM reg field, B/X come
// from the address's base/index registers.
void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256){
  bool vex_r = (xreg_enc >= 8);
  bool vex_b = adr.base_needs_rex();
  bool vex_x = adr.index_needs_rex();
  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
}

// VEX prefix for register-register forms; returns the low-3-bit reg/rm
// pair for the ModRM byte. X is never needed without a memory operand.
int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc, bool vex_w, bool vector256) {
  bool vex_r = (dst_enc >= 8);
  bool vex_b = (src_enc >= 8);
  bool vex_x = false;
  vex_prefix(vex_r, vex_b, vex_x, vex_w, nds_enc, pre, opc, vector256);
  return (((dst_enc & 7) << 3) | (src_enc & 7));
}
4068 
4069 
// Emit either a VEX prefix (when AVX is enabled) or the legacy SSE
// SIMD+REX prefix, for a memory operand. nds is the VEX second source;
// legacy SSE has no such operand, hence the assert.
void Assembler::simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  if (UseAVX > 0) {
    int xreg_enc = xreg->encoding();
    int  nds_enc = nds->is_valid() ? nds->encoding() : 0;
    vex_prefix(adr, nds_enc, xreg_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == xreg) || (nds == xnoreg), "wrong sse encoding");
    rex_prefix(adr, xreg, pre, opc, rex_w);
  }
}

// Register-register counterpart; returns the ModRM reg/rm encoding.
int Assembler::simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre, VexOpcode opc, bool rex_w, bool vector256) {
  int dst_enc = dst->encoding();
  int src_enc = src->encoding();
  if (UseAVX > 0) {
    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, rex_w, vector256);
  } else {
    assert((nds == dst) || (nds == src) || (nds == xnoreg), "wrong sse encoding");
    return rex_prefix_and_encode(dst_enc, src_enc, pre, opc, rex_w);
  }
}
4092 
// Two-operand SIMD arithmetic, destructive form (dst is also a source):
// memory-operand variant.
void Assembler::emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
  InstructionMark im(this);
  simd_prefix(dst, dst, src, pre);  // dst doubles as the nds (second source)
  emit_byte(opcode);
  emit_operand(dst, src);
}

// Register-register variant of the destructive form.
void Assembler::emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
  int encode = simd_prefix_and_encode(dst, dst, src, pre);
  emit_byte(opcode);
  emit_byte(0xC0 | encode);  // ModRM with mod=11 (register direct)
}

// Versions with no second source register (non-destructive source).
void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre) {
  InstructionMark im(this);
  simd_prefix(dst, xnoreg, src, pre);
  emit_byte(opcode);
  emit_operand(dst, src);
}

void Assembler::emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre) {
  int encode = simd_prefix_and_encode(dst, xnoreg, src, pre);
  emit_byte(opcode);
  emit_byte(0xC0 | encode);
}
4119 
// 3-operands AVX instructions
// dst = nds OP src, VEX-encoded; memory-operand variant.
void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
                               Address src, VexSimdPrefix pre, bool vector256) {
  InstructionMark im(this);
  vex_prefix(dst, nds, src, pre, vector256);
  emit_byte(opcode);
  emit_operand(dst, src);
}

// Register-register variant.
void Assembler::emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
                               XMMRegister src, VexSimdPrefix pre, bool vector256) {
  int encode = vex_prefix_and_encode(dst, nds, src, pre, vector256);
  emit_byte(opcode);
  emit_byte(0xC0 | encode);  // ModRM with mod=11 (register direct)
}
4135 
4136 #ifndef _LP64
4137 
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  // One-byte INC r32 (0x40 + rd); these opcodes are REX prefixes in 64-bit mode.
  emit_byte(0x40 | dst->encoding());
}

// On 32-bit, a generic lea is just leal.
void Assembler::lea(Register dst, Address src) {
  leal(dst, src);
}

// MOV m32, imm32 with relocated immediate (C7 /0).
void Assembler::mov_literal32(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0xC7);
  emit_operand(rax, dst);   // rax encodes the /0 opcode extension
  emit_data((int)imm32, rspec, 0);
}

// MOV r32, imm32 with relocated immediate (B8 + rd).
void Assembler::mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, 0);
}

void Assembler::popa() { // 32bit
  emit_byte(0x61);  // POPA: pop all general-purpose registers
}

// PUSH imm32 with relocation (0x68).
void Assembler::push_literal32(int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  emit_byte(0x68);
  emit_data(imm32, rspec, 0);
}

void Assembler::pusha() { // 32bit
  emit_byte(0x60);  // PUSHA: push all general-purpose registers
}

// SETNZ al/cl/dl/bl (0F 95 /r, register direct).
void Assembler::set_byte_if_not_zero(Register dst) {
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | dst->encoding());
}

// SHLD dst, src, CL (0F A5 /r): shift dst left, filling from src.
void Assembler::shldl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xA5);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}

// SHRD dst, src, CL (0F AD /r): shift dst right, filling from src.
void Assembler::shrdl(Register dst, Register src) {
  emit_byte(0x0F);
  emit_byte(0xAD);
  emit_byte(0xC0 | src->encoding() << 3 | dst->encoding());
}
4192 
4193 #else // LP64
4194 
// SETNZ r8 (0F 95 /r). byteinst=true in the prefix call forces a REX
// prefix for encodings 4-7 so SPL/BPL/SIL/DIL are used, not AH/CH/DH/BH.
void Assembler::set_byte_if_not_zero(Register dst) {
  int enc = prefix_and_encode(dst->encoding(), true);
  emit_byte(0x0F);
  emit_byte(0x95);
  emit_byte(0xE0 | enc);
}
4201 
4202 // 64bit only pieces of the assembler
4203 // This should only be used by 64bit instructions that can use rip-relative
4204 // it cannot be used by instructions that want an immediate value.
4205 
// Decide whether `adr` can be addressed rip-relatively (disp32) from
// anywhere in the code cache, based on its relocation type and distance.
// Returns false when a full 64-bit literal must be materialized instead.
bool Assembler::reachable(AddressLiteral adr) {
  int64_t disp;
  // None will force a 64bit literal to the code stream. Likely a placeholder
  // for something that will be patched later and we need to certain it will
  // always be reachable.
  if (adr.reloc() == relocInfo::none) {
    return false;
  }
  if (adr.reloc() == relocInfo::internal_word_type) {
    // This should be rip relative and easily reachable.
    return true;
  }
  if (adr.reloc() == relocInfo::virtual_call_type ||
      adr.reloc() == relocInfo::opt_virtual_call_type ||
      adr.reloc() == relocInfo::static_call_type ||
      adr.reloc() == relocInfo::static_stub_type ) {
    // This should be rip relative within the code cache and easily
    // reachable until we get huge code caches. (At which point
    // ic code is going to have issues).
    return true;
  }
  if (adr.reloc() != relocInfo::external_word_type &&
      adr.reloc() != relocInfo::poll_return_type &&  // these are really external_word but need special
      adr.reloc() != relocInfo::poll_type &&         // relocs to identify them
      adr.reloc() != relocInfo::runtime_call_type ) {
    return false;
  }

  // Stress the correction code
  if (ForceUnreachable) {
    // Must be runtimecall reloc, see if it is in the codecache
    // Flipping stuff in the codecache to be unreachable causes issues
    // with things like inline caches where the additional instructions
    // are not handled.
    if (CodeCache::find_blob(adr._target) == NULL) {
      return false;
    }
  }
  // For external_word_type/runtime_call_type if it is reachable from where we
  // are now (possibly a temp buffer) and where we might end up
  // anywhere in the codeCache then we are always reachable.
  // This would have to change if we ever save/restore shared code
  // to be more pessimistic.
  // Check the distance from BOTH ends of the code cache, since the code
  // being assembled could be placed anywhere within it.
  disp = (int64_t)adr._target - ((int64_t)CodeCache::low_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;
  disp = (int64_t)adr._target - ((int64_t)CodeCache::high_bound() + sizeof(int));
  if (!is_simm32(disp)) return false;

  disp = (int64_t)adr._target - ((int64_t)_code_pos + sizeof(int));

  // Because rip relative is a disp + address_of_next_instruction and we
  // don't know the value of address_of_next_instruction we apply a fudge factor
  // to make sure we will be ok no matter the size of the instruction we get placed into.
  // We don't have to fudge the checks above here because they are already worst case.

  // 12 == override/rex byte, opcode byte, rm byte, sib byte, a 4-byte disp , 4-byte literal
  // + 4 because better safe than sorry.
  const int fudge = 12 + 4;
  if (disp < 0) {
    disp -= fudge;
  } else {
    disp += fudge;
  }
  return is_simm32(disp);
}
4271 
4272 // Check if the polling page is not reachable from the code cache using rip-relative
4273 // addressing.
4274 bool Assembler::is_polling_page_far() {
4275   intptr_t addr = (intptr_t)os::get_polling_page();
4276   return ForceUnreachable ||
4277          !is_simm32(addr - (intptr_t)CodeCache::low_bound()) ||
4278          !is_simm32(addr - (intptr_t)CodeCache::high_bound());
4279 }
4280 
// Emit a 64-bit literal; wrap a bare reloc type into a RelocationHolder
// unless no relocation is needed.
void Assembler::emit_data64(jlong data,
                            relocInfo::relocType rtype,
                            int format) {
  if (rtype == relocInfo::none) {
    emit_long64(data);
  } else {
    emit_data64(data, Relocation::spec_simple(rtype), format);
  }
}

// Emit a relocated 64-bit literal. Only the immediate format is legal
// here; the relocation is attached to the enclosing instruction mark.
void Assembler::emit_data64(jlong data,
                            RelocationHolder const& rspec,
                            int format) {
  assert(imm_operand == 0, "default format must be immediate in this file");
  assert(imm_operand == format, "must be immediate");
  assert(inst_mark() != NULL, "must be inside InstructionMark");
  // Do not use AbstractAssembler::relocate, which is not intended for
  // embedded words.  Instead, relocate to the enclosing instruction.
  code_section()->relocate(inst_mark(), rspec, format);
#ifdef ASSERT
  check_relocation(rspec, format);
#endif
  emit_long64(data);
}
4305 
4306 int Assembler::prefix_and_encode(int reg_enc, bool byteinst) {
4307   if (reg_enc >= 8) {
4308     prefix(REX_B);
4309     reg_enc -= 8;
4310   } else if (byteinst && reg_enc >= 4) {
4311     prefix(REX);
4312   }
4313   return reg_enc;
4314 }
4315 
4316 int Assembler::prefixq_and_encode(int reg_enc) {
4317   if (reg_enc < 8) {
4318     prefix(REX_W);
4319   } else {
4320     prefix(REX_WB);
4321     reg_enc -= 8;
4322   }
4323   return reg_enc;
4324 }
4325 
4326 int Assembler::prefix_and_encode(int dst_enc, int src_enc, bool byteinst) {
4327   if (dst_enc < 8) {
4328     if (src_enc >= 8) {
4329       prefix(REX_B);
4330       src_enc -= 8;
4331     } else if (byteinst && src_enc >= 4) {
4332       prefix(REX);
4333     }
4334   } else {
4335     if (src_enc < 8) {
4336       prefix(REX_R);
4337     } else {
4338       prefix(REX_RB);
4339       src_enc -= 8;
4340     }
4341     dst_enc -= 8;
4342   }
4343   return dst_enc << 3 | src_enc;
4344 }
4345 
4346 int Assembler::prefixq_and_encode(int dst_enc, int src_enc) {
4347   if (dst_enc < 8) {
4348     if (src_enc < 8) {
4349       prefix(REX_W);
4350     } else {
4351       prefix(REX_WB);
4352       src_enc -= 8;
4353     }
4354   } else {
4355     if (src_enc < 8) {
4356       prefix(REX_WR);
4357     } else {
4358       prefix(REX_WRB);
4359       src_enc -= 8;
4360     }
4361     dst_enc -= 8;
4362   }
4363   return dst_enc << 3 | src_enc;
4364 }
4365 
// REX.B for an extended register used in the opcode/rm position.
void Assembler::prefix(Register reg) {
  if (reg->encoding() >= 8) {
    prefix(REX_B);
  }
}

// REX bits needed by a memory operand alone: B for an extended base,
// X for an extended index.
void Assembler::prefix(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_XB);
    } else {
      prefix(REX_B);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_X);
    }
  }
}

// As above but with REX.W set for 64-bit operand size; W is always
// emitted even when no extension bits are needed.
void Assembler::prefixq(Address adr) {
  if (adr.base_needs_rex()) {
    if (adr.index_needs_rex()) {
      prefix(REX_WXB);
    } else {
      prefix(REX_WB);
    }
  } else {
    if (adr.index_needs_rex()) {
      prefix(REX_WX);
    } else {
      prefix(REX_W);
    }
  }
}
4401 
4402 
// REX bits for a memory operand plus a register operand: R extends the
// reg field, B/X extend the address base/index. For byte instructions a
// bare REX is needed for reg encodings 4-7 (SPL/BPL/SIL/DIL).
void Assembler::prefix(Address adr, Register reg, bool byteinst) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      } else if (byteinst && reg->encoding() >= 4 ) {
        prefix(REX);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}
4434 
// 64-bit (REX.W) variant of prefix(Address, Register): W is always set,
// with R/X/B added for extended reg, index, and base respectively.
void Assembler::prefixq(Address adr, Register src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}
4466 
// REX bits for a memory operand plus an XMM register: same shape as the
// Register variant but XMM registers never need the byteinst bare-REX case.
void Assembler::prefix(Address adr, XMMRegister reg) {
  if (reg->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_XB);
      } else {
        prefix(REX_B);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_X);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_RXB);
      } else {
        prefix(REX_RB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_RX);
      } else {
        prefix(REX_R);
      }
    }
  }
}
4496 
// 64-bit (REX.W) variant of prefix(Address, XMMRegister).
void Assembler::prefixq(Address adr, XMMRegister src) {
  if (src->encoding() < 8) {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WXB);
      } else {
        prefix(REX_WB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WX);
      } else {
        prefix(REX_W);
      }
    }
  } else {
    if (adr.base_needs_rex()) {
      if (adr.index_needs_rex()) {
        prefix(REX_WRXB);
      } else {
        prefix(REX_WRB);
      }
    } else {
      if (adr.index_needs_rex()) {
        prefix(REX_WRX);
      } else {
        prefix(REX_WR);
      }
    }
  }
}
4528 
// ADC r64, imm32 (REX.W 81 /2 or sign-extended 83 /2 via emit_arith).
void Assembler::adcq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD0, dst, imm32);
}

// ADC r64, m64 (REX.W 13 /r).
void Assembler::adcq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x13);
  emit_operand(dst, src);
}
4540 
4541 void Assembler::adcq(Register dst, Register src) {
4542   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4543   emit_arith(0x13, 0xC0, dst, src);
4544 }
4545 
// ADD m64, imm (REX.W 81 /0, or sign-extended 83 /0 short form chosen
// by emit_arith_operand when the immediate fits in a signed byte).
void Assembler::addq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rax, dst,imm32);
}

// ADD m64, r64 (REX.W 01 /r).
void Assembler::addq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x01);
  emit_operand(src, dst);
}

// ADD r64, imm32.
void Assembler::addq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC0, dst, imm32);
}

// ADD r64, m64 (REX.W 03 /r).
void Assembler::addq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x03);
  emit_operand(dst, src);
}

// ADD r64, r64 (REX.W 03 /r).
void Assembler::addq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x03, 0xC0, dst, src);
}
4575 
4576 void Assembler::andq(Address dst, int32_t imm32) {
4577   InstructionMark im(this);
4578   prefixq(dst);
4579   emit_byte(0x81);
4580   emit_operand(rsp, dst, 4);
4581   emit_long(imm32);
4582 }
4583 
// AND r64, imm32 (REX.W 81 /4 or sign-extended 83 /4 via emit_arith).
void Assembler::andq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE0, dst, imm32);
}

// AND r64, m64 (REX.W 23 /r).
void Assembler::andq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x23);
  emit_operand(dst, src);
}
4595 
4596 void Assembler::andq(Register dst, Register src) {
4597   (int) prefixq_and_encode(dst->encoding(), src->encoding());
4598   emit_arith(0x23, 0xC0, dst, src);
4599 }
4600 
// BSF r64, r64 (REX.W 0F BC /r): index of lowest set bit.
void Assembler::bsfq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBC);
  emit_byte(0xC0 | encode);
}

// BSR r64, r64 (REX.W 0F BD /r): index of highest set bit. With an F3
// prefix present elsewhere this encoding would be LZCNT, hence the assert.
void Assembler::bsrq(Register dst, Register src) {
  assert(!VM_Version::supports_lzcnt(), "encoding is treated as LZCNT");
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// BSWAP r64 (REX.W 0F C8+rd): byte-swap a 64-bit register.
void Assembler::bswapq(Register reg) {
  int encode = prefixq_and_encode(reg->encoding());
  emit_byte(0x0F);
  emit_byte(0xC8 | encode);
}

// CQO (REX.W 99): sign-extend RAX into RDX:RAX.
void Assembler::cdqq() {
  prefix(REX_W);
  emit_byte(0x99);
}

// CLFLUSH m8 (0F AE /7): flush the cache line containing adr.
void Assembler::clflush(Address adr) {
  prefix(adr);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(rdi, adr);  // rdi encodes the /7 opcode extension
}
4633 
// CMOVcc r64, r64 (REX.W 0F 40+cc /r): conditional move.
void Assembler::cmovq(Condition cc, Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0x40 | cc);  // condition code forms the low nibble of the opcode
  emit_byte(0xC0 | encode);
}

// CMOVcc r64, m64.
void Assembler::cmovq(Condition cc, Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0x40 | cc);
  emit_operand(dst, src);
}
4648 
4649 void Assembler::cmpq(Address dst, int32_t imm32) {
4650   InstructionMark im(this);
4651   prefixq(dst);
4652   emit_byte(0x81);
4653   emit_operand(rdi, dst, 4);
4654   emit_long(imm32);
4655 }
4656 
4657 void Assembler::cmpq(Register dst, int32_t imm32) {
4658   (void) prefixq_and_encode(dst->encoding());
4659   emit_arith(0x81, 0xF8, dst, imm32);
4660 }
4661 
4662 void Assembler::cmpq(Address dst, Register src) {
4663   InstructionMark im(this);
4664   prefixq(dst, src);
4665   emit_byte(0x3B);
4666   emit_operand(src, dst);
4667 }
4668 
4669 void Assembler::cmpq(Register dst, Register src) {
4670   (void) prefixq_and_encode(dst->encoding(), src->encoding());
4671   emit_arith(0x3B, 0xC0, dst, src);
4672 }
4673 
4674 void Assembler::cmpq(Register dst, Address  src) {
4675   InstructionMark im(this);
4676   prefixq(src, dst);
4677   emit_byte(0x3B);
4678   emit_operand(dst, src);
4679 }
4680 
// CMPXCHG [adr], reg, 64-bit (REX.W 0F B1 /r). Compares rax with [adr];
// callers emit any LOCK prefix themselves.
void Assembler::cmpxchgq(Register reg, Address adr) {
  InstructionMark im(this);
  prefixq(adr, reg);
  emit_byte(0x0F);
  emit_byte(0xB1);
  emit_operand(reg, adr);
}

// CVTSI2SD with a 64-bit integer source register (F2 REX.W 0F 2A /r).
void Assembler::cvtsi2sdq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

// CVTSI2SD with a 64-bit integer memory source.
void Assembler::cvtsi2sdq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F2);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// CVTSI2SS with a 64-bit integer source register (F3 REX.W 0F 2A /r).
void Assembler::cvtsi2ssq(XMMRegister dst, Register src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_byte(0xC0 | encode);
}

// CVTSI2SS with a 64-bit integer memory source.
void Assembler::cvtsi2ssq(XMMRegister dst, Address src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  InstructionMark im(this);
  simd_prefix_q(dst, dst, src, VEX_SIMD_F3);
  emit_byte(0x2A);
  emit_operand(dst, src);
}

// CVTTSD2SI, truncating double -> 64-bit integer (F2 REX.W 0F 2C /r).
void Assembler::cvttsd2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F2);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}

// CVTTSS2SI, truncating float -> 64-bit integer (F3 REX.W 0F 2C /r).
void Assembler::cvttss2siq(Register dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_F3);
  emit_byte(0x2C);
  emit_byte(0xC0 | encode);
}
4732 
// 32-bit DEC, register form (FF /1 on the C8 row).
void Assembler::decl(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementl() instead.
  // Use two-byte form (one-byte form is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}

// 64-bit DEC, register form (REX.W FF /1).
void Assembler::decq(Register dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC8 | encode);
}

// 64-bit DEC, memory form (REX.W FF /1; /1 encoded via rcx).
void Assembler::decq(Address dst) {
  // Don't use it directly. Use MacroAssembler::decrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rcx, dst);
}

// FXRSTOR (0F AE /1); the REX.W from prefixq selects the 64-bit
// (FXRSTOR64) layout of the 512-byte save area.
void Assembler::fxrstor(Address src) {
  prefixq(src);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(1), src);
}

// FXSAVE (0F AE /0); REX.W selects the 64-bit (FXSAVE64) layout.
void Assembler::fxsave(Address dst) {
  prefixq(dst);
  emit_byte(0x0F);
  emit_byte(0xAE);
  emit_operand(as_Register(0), dst);
}
4770 
// Signed divide rdx:rax by src, 64-bit (REX.W F7 /7): quotient -> rax,
// remainder -> rdx.
void Assembler::idivq(Register src) {
  int encode = prefixq_and_encode(src->encoding());
  emit_byte(0xF7);
  emit_byte(0xF8 | encode);
}

// Signed multiply: dst *= src, 64-bit (REX.W 0F AF /r).
void Assembler::imulq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xAF);
  emit_byte(0xC0 | encode);
}

// Signed multiply by immediate: dst = src * value. Uses the short
// sign-extended imm8 form (6B /r) when the value fits, else 69 /r imm32.
void Assembler::imulq(Register dst, Register src, int value) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  if (is8bit(value)) {
    emit_byte(0x6B);
    emit_byte(0xC0 | encode);
    emit_byte(value & 0xFF);
  } else {
    emit_byte(0x69);
    emit_byte(0xC0 | encode);
    emit_long(value);
  }
}
4796 
// 32-bit INC, register form (FF /0 on the C0 row).
void Assembler::incl(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementl() instead.
  // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}

// 64-bit INC, register form (REX.W FF /0).
void Assembler::incq(Register dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  // Use two-byte form (one-byte from is a REX prefix in 64-bit mode)
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xFF);
  emit_byte(0xC0 | encode);
}

// 64-bit INC, memory form (REX.W FF /0; /0 encoded via rax).
void Assembler::incq(Address dst) {
  // Don't use it directly. Use MacroAssembler::incrementq() instead.
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xFF);
  emit_operand(rax, dst);
}

// Pointer-sized LEA: on 64-bit this is simply leaq.
void Assembler::lea(Register dst, Address src) {
  leaq(dst, src);
}

// Load effective address, 64-bit (REX.W 8D /r).
void Assembler::leaq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8D);
  emit_operand(dst, src);
}
4831 
// Load a full 64-bit immediate into dst (REX.W B8+rd imm64).
void Assembler::mov64(Register dst, int64_t imm64) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_long64(imm64);
}

// Same as mov64, but records relocation info (rspec) for the immediate
// so the patching machinery can update it (e.g. for embedded pointers).
void Assembler::mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data64(imm64, rspec);
}

// Load a 32-bit narrow (compressed) oop immediate into dst; no REX.W, so
// this is the 32-bit B8+rd form. The immediate is relocated as a
// narrow_oop_operand.
void Assembler::mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(dst->encoding());
  emit_byte(0xB8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// Store a narrow-oop immediate to memory (C7 /0 imm32, /0 via rax).
void Assembler::mov_narrow_oop(Address dst, int32_t imm32,  RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// Compare a register with a narrow-oop immediate (81 /7 imm32).
void Assembler::cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  int encode = prefix_and_encode(src1->encoding());
  emit_byte(0x81);
  emit_byte(0xF8 | encode);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}

// Compare a memory operand with a narrow-oop immediate (81 /7 imm32).
void Assembler::cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec) {
  InstructionMark im(this);
  prefix(src1);
  emit_byte(0x81);
  emit_operand(rax, src1, 4);
  emit_data((int)imm32, rspec, narrow_oop_operand);
}
4876 
// LZCNT, 64-bit (F3 REX.W 0F BD /r). The F3 prefix must precede the REX
// prefix; without LZCNT support the CPU would decode these bytes as BSR,
// hence the guard.
void Assembler::lzcntq(Register dst, Register src) {
  assert(VM_Version::supports_lzcnt(), "encoding is treated as BSR");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBD);
  emit_byte(0xC0 | encode);
}

// MOVQ gpr -> xmm (66 REX.W 0F 6E /r).
void Assembler::movdq(XMMRegister dst, Register src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  int encode = simd_prefix_and_encode_q(dst, src, VEX_SIMD_66);
  emit_byte(0x6E);
  emit_byte(0xC0 | encode);
}

// MOVQ xmm -> gpr (66 REX.W 0F 7E /r).
void Assembler::movdq(Register dst, XMMRegister src) {
  // table D-1 says MMX/SSE2
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  // swap src/dst to get correct prefix
  int encode = simd_prefix_and_encode_q(src, dst, VEX_SIMD_66);
  emit_byte(0x7E);
  emit_byte(0xC0 | encode);
}
4902 
// 64-bit register-to-register move (REX.W 8B /r).
void Assembler::movq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x8B);
  emit_byte(0xC0 | encode);
}

// 64-bit load: dst = [src] (REX.W 8B /r).
void Assembler::movq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x8B);
  emit_operand(dst, src);
}

// 64-bit store: [dst] = src (REX.W 89 /r).
void Assembler::movq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x89);
  emit_operand(src, dst);
}

// Sign-extending byte load: dst = (int64)[src] (REX.W 0F BE /r, MOVSX).
void Assembler::movsbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_operand(dst, src);
}

// Sign-extending byte move, register form.
void Assembler::movsbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBE);
  emit_byte(0xC0 | encode);
}
4937 
// NOTE(review): intentionally disabled (ShouldNotReachHere). Besides the
// dbx evidence cited below, the encoding itself is malformed: 0xC7 needs
// a ModRM byte (/0), so OR-ing the register number into the opcode byte
// (B8+rd style) produces a different instruction entirely.
void Assembler::movslq(Register dst, int32_t imm32) {
  // dbx shows movslq(rcx, 3) as movq     $0x0000000049000000,(%rbx)
  // and movslq(r8, 3); as movl     $0x0000000048000000,(%rbx)
  // as a result we shouldn't use until tested at runtime...
  ShouldNotReachHere();
  InstructionMark im(this);
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xC7 | encode);
  emit_long(imm32);
}

// Store a sign-extendable imm32 into a 64-bit memory slot
// (REX.W C7 /0 imm32; the CPU sign-extends the immediate to 64 bits).
void Assembler::movslq(Address dst, int32_t imm32) {
  assert(is_simm32(imm32), "lost bits");
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0xC7);
  emit_operand(rax, dst, 4);
  emit_long(imm32);
}

// MOVSXD: sign-extend a 32-bit memory operand into dst (REX.W 63 /r).
void Assembler::movslq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x63);
  emit_operand(dst, src);
}

// MOVSXD, register form: dst = (int64)(int32)src.
void Assembler::movslq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x63);
  emit_byte(0xC0 | encode);
}
4970 
// Sign-extending 16-bit load (REX.W 0F BF /r, MOVSX).
void Assembler::movswq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_operand(dst, src);
}

// Sign-extending 16-bit move, register form.
void Assembler::movswq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xBF);
  emit_byte(0xC0 | encode);
}

// Zero-extending byte load (REX.W 0F B6 /r, MOVZX).
void Assembler::movzbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_operand(dst, src);
}

// Zero-extending byte move, register form.
void Assembler::movzbq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB6);
  emit_byte(0xC0 | encode);
}

// Zero-extending 16-bit load (REX.W 0F B7 /r, MOVZX).
void Assembler::movzwq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_operand(dst, src);
}

// Zero-extending 16-bit move, register form.
void Assembler::movzwq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB7);
  emit_byte(0xC0 | encode);
}
5015 
// Two's-complement negate, 64-bit (REX.W F7 /3).
void Assembler::negq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD8 | encode);
}

// Bitwise NOT, 64-bit (REX.W F7 /2).
void Assembler::notq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xF7);
  emit_byte(0xD0 | encode);
}

// 64-bit OR [dst], imm32 (REX.W 81 /1; /1 encoded via rcx).
void Assembler::orq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x81);
  emit_operand(rcx, dst, 4);
  emit_long(imm32);
}

// 64-bit OR dst, imm32; emit_arith may select the short imm8 form.
void Assembler::orq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xC8, dst, imm32);
}

// 64-bit OR dst, [src] (REX.W 0B /r).
void Assembler::orq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x0B);
  emit_operand(dst, src);
}

// 64-bit OR, register form (REX.W 0B /r).
void Assembler::orq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x0B, 0xC0, dst, src);
}
5052 
// Restore all 16 GPRs from the 16-word frame laid down by pusha() and
// release it. Slot 11 (where pusha saved the original rsp) is skipped;
// rsp is instead recovered by the final addq.
void Assembler::popa() { // 64bit
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9,  Address(rsp, 6 * wordSize));
  movq(r8,  Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  // skip rsp
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));

  addq(rsp, 16 * wordSize);
}
5073 
// POPCNT, memory form (F3 REX.W 0F B8 /r). The mandatory F3 prefix must
// be emitted before the REX prefix.
void Assembler::popcntq(Register dst, Address src) {
  assert(VM_Version::supports_popcnt(), "must support");
  InstructionMark im(this);
  emit_byte(0xF3);
  prefixq(src, dst);
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_operand(dst, src);
}

// POPCNT, register form: dst = number of set bits in src.
void Assembler::popcntq(Register dst, Register src) {
  assert(VM_Version::supports_popcnt(), "must support");
  emit_byte(0xF3);
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x0F);
  emit_byte(0xB8);
  emit_byte(0xC0 | encode);
}

// POP into memory (8F /0; /0 encoded via rax).
void Assembler::popq(Address dst) {
  InstructionMark im(this);
  prefixq(dst);
  emit_byte(0x8F);
  emit_operand(rax, dst);
}
5099 
// Save all 16 GPRs into a freshly allocated 16-word frame (popa() is the
// inverse). The original rsp is written at rsp-5*wordSize BEFORE the
// subtraction, which lands it in slot 11 of the new frame -- exactly the
// slot popa() skips. That pre-sub store is legal because the ABI grants
// 128 bytes of scratch (the red zone) below rsp.
void Assembler::pusha() { // 64bit
  // we have to store original rsp.  ABI says that 128 bytes
  // below rsp are local scratch.
  movq(Address(rsp, -5 * wordSize), rsp);

  subq(rsp, 16 * wordSize);

  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  // skip rsp
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}

// PUSH a memory operand (FF /6; /6 encoded via rsi).
void Assembler::pushq(Address src) {
  InstructionMark im(this);
  prefixq(src);
  emit_byte(0xFF);
  emit_operand(rsi, src);
}
5131 
// Rotate-through-carry left by imm8, 64-bit (D1 /2 for count 1, else
// C1 /2 imm8). The assert allows counts up to 63: isShiftCount checks
// 32-bit range, so the count is halved before the check.
void Assembler::rclq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xD0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xD0 | encode);
    emit_byte(imm8);
  }
}
// Arithmetic shift right by imm8, 64-bit (D1 /7 for count 1, else C1 /7).
void Assembler::sarq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xF8 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xF8 | encode);
    emit_byte(imm8);
  }
}

// Arithmetic shift right by cl, 64-bit (D3 /7).
void Assembler::sarq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xF8 | encode);
}
5162 
// Subtract-with-borrow [dst] -= imm32 + CF (81 /3 via rbx; the helper
// may pick the sign-extended imm8 form).
void Assembler::sbbq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbx, dst, imm32);
}

// Subtract-with-borrow dst -= imm32 + CF.
void Assembler::sbbq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xD8, dst, imm32);
}

// Subtract-with-borrow dst -= [src] + CF (REX.W 1B /r).
void Assembler::sbbq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x1B);
  emit_operand(dst, src);
}

// Subtract-with-borrow, register form (REX.W 1B /r).
void Assembler::sbbq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x1B, 0xC0, dst, src);
}

// Shift left by imm8, 64-bit (D1 /4 for count 1, else C1 /4 imm8).
// isShiftCount checks 32-bit range, so halve the count first.
void Assembler::shlq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  if (imm8 == 1) {
    emit_byte(0xD1);
    emit_byte(0xE0 | encode);
  } else {
    emit_byte(0xC1);
    emit_byte(0xE0 | encode);
    emit_byte(imm8);
  }
}

// Shift left by cl, 64-bit (D3 /4).
void Assembler::shlq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE0 | encode);
}

// Logical shift right by imm8, 64-bit (C1 /5 imm8; no count==1 shortcut here).
void Assembler::shrq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xC1);
  emit_byte(0xE8 | encode);
  emit_byte(imm8);
}

// Logical shift right by cl, 64-bit (D3 /5).
void Assembler::shrq(Register dst) {
  int encode = prefixq_and_encode(dst->encoding());
  emit_byte(0xD3);
  emit_byte(0xE8 | encode);
}
5218 
// 64-bit SUB [dst], imm32 (81 /5 via rbp; helper may use the imm8 form).
void Assembler::subq(Address dst, int32_t imm32) {
  InstructionMark im(this);
  prefixq(dst);
  emit_arith_operand(0x81, rbp, dst, imm32);
}

// 64-bit SUB [dst], src (REX.W 29 /r).
void Assembler::subq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x29);
  emit_operand(src, dst);
}

// 64-bit SUB dst, imm32; emit_arith may pick the sign-extended imm8 form.
void Assembler::subq(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith(0x81, 0xE8, dst, imm32);
}

// Force generation of a 4 byte immediate value even if it fits into 8bit
// (callers rely on a fixed-length instruction, e.g. for patching).
void Assembler::subq_imm32(Register dst, int32_t imm32) {
  (void) prefixq_and_encode(dst->encoding());
  emit_arith_imm32(0x81, 0xE8, dst, imm32);
}

// 64-bit SUB dst, [src] (REX.W 2B /r).
void Assembler::subq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x2B);
  emit_operand(dst, src);
}

// 64-bit SUB, register form (REX.W 2B /r).
void Assembler::subq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x2B, 0xC0, dst, src);
}
5254 
// 64-bit TEST dst, imm32. Uses the one-byte-shorter rax-specific form
// (REX.W A9 imm32) when dst is rax (encoding 0), else REX.W F7 /0 imm32.
void Assembler::testq(Register dst, int32_t imm32) {
  // not using emit_arith because test
  // doesn't support sign-extension of
  // 8bit operands
  int encode = dst->encoding();
  if (encode == 0) {
    prefix(REX_W);
    emit_byte(0xA9);
  } else {
    encode = prefixq_and_encode(encode);
    emit_byte(0xF7);
    emit_byte(0xC0 | encode);
  }
  emit_long(imm32);
}

// 64-bit TEST, register form (REX.W 85 /r): flags from dst & src.
void Assembler::testq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x85, 0xC0, dst, src);
}
5275 
// XADD [dst], src, 64-bit (REX.W 0F C1 /r): exchange then add; callers
// emit any LOCK prefix themselves.
void Assembler::xaddq(Address dst, Register src) {
  InstructionMark im(this);
  prefixq(dst, src);
  emit_byte(0x0F);
  emit_byte(0xC1);
  emit_operand(src, dst);
}

// XCHG dst, [src], 64-bit (REX.W 87 /r; implicitly locked by the CPU
// when a memory operand is involved).
void Assembler::xchgq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x87);
  emit_operand(dst, src);
}

// XCHG, register form (REX.W 87 /r).
void Assembler::xchgq(Register dst, Register src) {
  int encode = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_byte(0x87);
  emit_byte(0xc0 | encode);
}

// 64-bit XOR, register form (REX.W 33 /r).
void Assembler::xorq(Register dst, Register src) {
  (void) prefixq_and_encode(dst->encoding(), src->encoding());
  emit_arith(0x33, 0xC0, dst, src);
}

// 64-bit XOR dst, [src] (REX.W 33 /r).
void Assembler::xorq(Register dst, Address src) {
  InstructionMark im(this);
  prefixq(src, dst);
  emit_byte(0x33);
  emit_operand(dst, src);
}
5308 
#endif // _LP64
5310 
// Table mapping each Assembler::Condition (used as the index; see the
// hex values in the per-entry comments) to its logical negation.
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
5330 
5331 
5332 // Implementation of MacroAssembler
5333 
5334 // First all the versions that have distinct versions depending on 32/64 bit
5335 // Unless the difference is trivial (1 line or so).
5336 
5337 #ifndef _LP64
5338 
5339 // 32bit versions
5340 
// 32-bit: an AddressLiteral's target is directly addressable, so it maps
// straight onto an absolute Address with the same relocation spec.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// 32-bit: an ArrayAddress can likewise be used as a direct Address.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}
5348 
// 32-bit fast path for acquiring a biased lock on obj_reg.
//
// On success (object already biased to this thread, or successfully
// biased/rebiased here) control jumps to 'done'. If a CAS loses and
// slow_case is non-NULL, control jumps to *slow_case; otherwise it falls
// through to 'cas_label' at the end, where the caller's normal CAS-based
// locking takes over.
//
// Registers:
//   lock_reg  - must point to a word the mark can be spilled into
//               (saved_mark_addr below); also borrowed (with push/pop)
//               as the temp when tmp_reg is noreg.
//   swap_reg  - must be rax (cmpxchg implicit operand); on entry holds
//               the mark word iff swap_reg_contains_mark.
//   tmp_reg   - optional scratch, may be noreg.
// Returns the code offset of the instruction that can take an implicit
// null check on obj_reg (the mark-word load, or the klass load when the
// mark was supplied by the caller).
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax, for cmpxchg");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    need_tmp_reg = true;
    tmp_reg = lock_reg;
  } else {
    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  }
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movl(swap_reg, mark_addr);
  }
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, swap_reg);
  andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  // Note that because there is no current thread register on x86 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movl(saved_mark_addr, swap_reg);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  xorl(swap_reg, tmp_reg);
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  movl(tmp_reg, klass_addr);
  xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
  jcc(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testl(swap_reg, markOopDesc::epoch_mask_in_place);
  jcc(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  movl(swap_reg, saved_mark_addr);
  andl(swap_reg,
       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  orl(tmp_reg, swap_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  get_thread(tmp_reg);
  movl(swap_reg, klass_addr);
  orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
  movl(swap_reg, saved_mark_addr);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  movl(swap_reg, saved_mark_addr);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  movl(tmp_reg, klass_addr);
  movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, Address(obj_reg, 0));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
// Call a C leaf routine (no safepoint/GC interaction). Arguments have
// already been pushed on the stack by the caller; this pops them after
// the call (cdecl-style caller cleanup on x86_32).
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}
5565 
// Compare a memory word against a jobject immediate. The oop relocation
// lets the GC find and update the embedded pointer in the code stream.
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5569 
// Compare a register against a jobject immediate (with oop relocation).
void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5573 
5574 void MacroAssembler::extend_sign(Register hi, Register lo) {
5575   // According to Intel Doc. AP-526, "Integer Divide", p.18.
5576   if (VM_Version::is_P6() && hi == rdx && lo == rax) {
5577     cdql();
5578   } else {
5579     movl(hi, lo);
5580     sarl(hi, 31);
5581   }
5582 }
5583 
// Branch to L if the x87 FPU condition flag C2 is set.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();   // store the x87 status word into ax (C2 lands in AH)
  sahf();                 // load AH into EFLAGS; C2 maps onto the parity flag
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}
5593 
// Branch to L if the x87 FPU condition flag C2 is clear.
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();   // store the x87 status word into ax (C2 lands in AH)
  sahf();                 // load AH into EFLAGS; C2 maps onto the parity flag
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}
5603 
5604 // 32bit can do a case table jump in one instruction but we no longer allow the base
5605 // to be installed in the Address class
// Indirect jump through a case-table entry (see the note just above).
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}
5609 
// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  // Leaves -1, 0 or +1 in x_hi for x < y, x == y, x > y respectively.
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);              // high words equal: unsigned compare of low words
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);               // result := 1
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);              // result := -1

  bind(done);
}
5635 
// Load the literal's address (not its contents) into dst, with relocation.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}
5639 
// Store the literal's address into the memory word at dst.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}
5645 
// Tear down the current frame (equivalent of the LEAVE instruction).
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
5650 
// 64-bit multiply of two stack-resident longs; clobbers rax, rbx, rcx, rdx.
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
5690 
// Two's-complement negation of the 64-bit value in the hi:lo pair.
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);   // propagate the borrow out of the low word
  negl(hi);
}
5696 
// 64-bit left shift of hi:lo by the count in rcx (clobbers rcx's low bits).
void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}
5716 
5717 
// 64-bit right shift of hi:lo by the count in rcx; arithmetic when
// sign_extension is true, logical otherwise.
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
5738 
// Load a jobject immediate into dst, recording an oop relocation.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5742 
// Store a jobject immediate into memory, recording an oop relocation.
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
5746 
5747 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
5748   if (src.is_lval()) {
5749     mov_literal32(dst, (intptr_t)src.target(), src.rspec());
5750   } else {
5751     movl(dst, as_Address(src));
5752   }
5753 }
5754 
// Store src into the array slot named by dst.
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}
5758 
// Load the array slot named by src into dst.
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}
5762 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
5767 
5768 
// Restore the registers saved by push_callee_saved_registers();
// pop order is the exact reverse of the push order.
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}
5775 
// Pop a 64-bit double off the expression stack onto the FPU top-of-stack.
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);   // a double occupies two 32-bit stack words
}
5780 
// Save registers across a call; restored (in reverse) by
// pop_callee_saved_registers().
void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}
5787 
// Spill the FPU top-of-stack double onto the expression stack (pops x87 TOS).
void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);   // a double occupies two 32-bit stack words
  fstp_d(Address(rsp, 0));
}
5792 
5793 
// Push a jobject immediate, recording an oop relocation.
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}
5797 
5798 
5799 void MacroAssembler::pushptr(AddressLiteral src) {
5800   if (src.is_lval()) {
5801     push_literal32((int32_t)src.target(), src.rspec());
5802   } else {
5803     pushl(as_Address(src));
5804   }
5805 }
5806 
// Materialize the current "not zero" condition as a full word in dst.
// NOTE(review): xorl itself sets ZF; if set_byte_if_not_zero emits a
// setcc that tests the zero flag, the condition being captured is
// clobbered here — verify set_byte_if_not_zero's condition and callers.
void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}
5811 
// 32-bit: VM-call arguments travel on the stack, so "passing" is a push.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5815 
// 32-bit: VM-call arguments travel on the stack, so "passing" is a push.
static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5819 
// 32-bit: VM-call arguments travel on the stack, so "passing" is a push.
static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5823 
// 32-bit: VM-call arguments travel on the stack, so "passing" is a push.
static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
5827 
5828 #ifndef PRODUCT
5829 extern "C" void findpc(intptr_t x);
5830 #endif
5831 
5832 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
5833   // In order to get locks to work, we need to fake a in_VM state
5834   JavaThread* thread = JavaThread::current();
5835   JavaThreadState saved_state = thread->thread_state();
5836   thread->set_thread_state(_thread_in_vm);
5837   if (ShowMessageBoxOnError) {
5838     JavaThread* thread = JavaThread::current();
5839     JavaThreadState saved_state = thread->thread_state();
5840     thread->set_thread_state(_thread_in_vm);
5841     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
5842       ttyLocker ttyl;
5843       BytecodeCounter::print();
5844     }
5845     // To see where a verify_oop failed, get $ebx+40/X for this frame.
5846     // This is the value of eip which points to where verify_oop will return.
5847     if (os::message_box(msg, "Execution stopped, print registers?")) {
5848       ttyLocker ttyl;
5849       tty->print_cr("eip = 0x%08x", eip);
5850 #ifndef PRODUCT
5851       if ((WizardMode || Verbose) && PrintMiscellaneous) {
5852         tty->cr();
5853         findpc(eip);
5854         tty->cr();
5855       }
5856 #endif
5857       tty->print_cr("rax = 0x%08x", rax);
5858       tty->print_cr("rbx = 0x%08x", rbx);
5859       tty->print_cr("rcx = 0x%08x", rcx);
5860       tty->print_cr("rdx = 0x%08x", rdx);
5861       tty->print_cr("rdi = 0x%08x", rdi);
5862       tty->print_cr("rsi = 0x%08x", rsi);
5863       tty->print_cr("rbp = 0x%08x", rbp);
5864       tty->print_cr("rsp = 0x%08x", rsp);
5865       BREAKPOINT;
5866       assert(false, "start up GDB");
5867     }
5868   } else {
5869     ttyLocker ttyl;
5870     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
5871     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
5872   }
5873   ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
5874 }
5875 
// Emit code that halts the VM with a message; the pushed registers,
// faked eip and msg become the parameters of debug32 (in pusha order).
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                           // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}
5885 
// Emit code that prints a warning via the C `warning` routine and
// continues; all CPU state is preserved around the call.
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
5897 
5898 #else // _LP64
5899 
5900 // 64 bit versions
5901 
// Convert a (reachable) rval literal to a rip-relative Address.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}
5911 
// Convert an ArrayAddress to a concrete Address; materializes the array
// base in rscratch1 (so rscratch1 is clobbered as a side effect).
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
5920 
// Emit the biased-locking fast path for monitor acquisition (64-bit).
// On success control reaches `done` with the bias held; otherwise it
// falls through to `cas_label` (normal CAS locking) or branches to
// `slow_case` when supplied. Returns the code offset of the mark-word
// load (for implicit null-check bookkeeping), or -1 when the caller
// already loaded the mark word into swap_reg.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movq(swap_reg, mark_addr);
  }
  movq(tmp_reg, swap_reg);
  andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orq(tmp_reg, r15_thread);
  xorq(tmp_reg, swap_reg);
  andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    // Counter bump must precede the jcc: cond_inc32 relies on the flags
    // from the andq above.
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  jcc(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testq(tmp_reg, markOopDesc::epoch_mask_in_place);
  jcc(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  andq(swap_reg,
       markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  movq(tmp_reg, swap_reg);
  orq(tmp_reg, r15_thread);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgq(tmp_reg, Address(obj_reg, 0));
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg);
  orq(tmp_reg, r15_thread);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgq(tmp_reg, Address(obj_reg, 0));
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgq(tmp_reg, Address(obj_reg, 0));
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
6079 
// Call a C leaf routine with args already in registers (64-bit ABI),
// keeping rsp 16-byte aligned across the call.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for it's register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);   // already 16-byte aligned: call directly

  // Misaligned: drop 8 bytes to restore 16-byte alignment
  // (assumes rsp is at least 8-byte aligned here — TODO confirm).
  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
6113 
6114 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
6115   assert(!src2.is_lval(), "should use cmpptr");
6116 
6117   if (reachable(src2)) {
6118     cmpq(src1, as_Address(src2));
6119   } else {
6120     lea(rscratch1, src2);
6121     Assembler::cmpq(src1, Address(rscratch1, 0));
6122   }
6123 }
6124 
// Clobbers rdx (remainder) and rax (quotient) per the idivq contract.
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
6161 
6162 void MacroAssembler::decrementq(Register reg, int value) {
6163   if (value == min_jint) { subq(reg, value); return; }
6164   if (value <  0) { incrementq(reg, -value); return; }
6165   if (value == 0) {                        ; return; }
6166   if (value == 1 && UseIncDec) { decq(reg) ; return; }
6167   /* else */      { subq(reg, value)       ; return; }
6168 }
6169 
6170 void MacroAssembler::decrementq(Address dst, int value) {
6171   if (value == min_jint) { subq(dst, value); return; }
6172   if (value <  0) { incrementq(dst, -value); return; }
6173   if (value == 0) {                        ; return; }
6174   if (value == 1 && UseIncDec) { decq(dst) ; return; }
6175   /* else */      { subq(dst, value)       ; return; }
6176 }
6177 
6178 void MacroAssembler::incrementq(Register reg, int value) {
6179   if (value == min_jint) { addq(reg, value); return; }
6180   if (value <  0) { decrementq(reg, -value); return; }
6181   if (value == 0) {                        ; return; }
6182   if (value == 1 && UseIncDec) { incq(reg) ; return; }
6183   /* else */      { addq(reg, value)       ; return; }
6184 }
6185 
6186 void MacroAssembler::incrementq(Address dst, int value) {
6187   if (value == min_jint) { addq(dst, value); return; }
6188   if (value <  0) { decrementq(dst, -value); return; }
6189   if (value == 0) {                        ; return; }
6190   if (value == 1 && UseIncDec) { incq(dst) ; return; }
6191   /* else */      { addq(dst, value)       ; return; }
6192 }
6193 
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
// 64-bit: materialize the table base in rscratch1 and jump indirect.
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}
6203 
// Dead on 64-bit: longs live in a single register, not a hi/lo pair.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
6208 
// Load the literal's address (not its contents) into dst, with relocation.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}
6212 
// Store the literal's address into memory; clobbers rscratch1.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}
6217 
// Tear down the current frame with the one-byte LEAVE instruction.
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_byte(0xC9); // LEAVE
}
6222 
// Dead on 64-bit: longs live in a single register, not a hi/lo pair.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
6227 
// Load a jobject immediate into dst, recording an oop relocation.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}
6231 
// Store a jobject immediate into memory via rscratch1 (64-bit stores
// take no 64-bit immediate); clobbers rscratch1.
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}
6236 
6237 void MacroAssembler::movptr(Register dst, AddressLiteral src) {
6238   if (src.is_lval()) {
6239     mov_literal64(dst, (intptr_t)src.target(), src.rspec());
6240   } else {
6241     if (reachable(src)) {
6242       movq(dst, as_Address(src));
6243     } else {
6244       lea(rscratch1, src);
6245       movq(dst, Address(rscratch1,0));
6246     }
6247   }
6248 }
6249 
// Store src into the array slot named by dst (as_Address clobbers rscratch1).
void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}
6253 
// Load the array slot named by src into dst (as_Address clobbers rscratch1).
void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}
6257 
// src should NEVER be a real pointer. Use AddressLiteral for true pointers
// Goes through rscratch1 since 64-bit stores take no 64-bit immediate.
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}
6263 
// These are mostly for initializing NULL
// Store a sign-extended 32-bit immediate into a 64-bit memory word.
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}
6268 
// Load a sign-extended 32-bit immediate into a 64-bit register.
void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
6272 
// Push a jobject immediate via rscratch1 (clobbered), with oop relocation.
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}
6277 
6278 void MacroAssembler::pushptr(AddressLiteral src) {
6279   lea(rscratch1, src);
6280   if (src.is_lval()) {
6281     push(rscratch1);
6282   } else {
6283     pushq(Address(rscratch1, 0));
6284   }
6285 }
6286 
// Clear the current thread's last-Java-frame anchor. sp is always
// cleared; fp and pc only when the corresponding flag is set.
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}
6301 
// Record the current thread's last-Java-frame anchor. fp and pc are
// optional; sp defaults to rsp and is stored last (presumably so a
// stack walker never observes sp set with a stale fp/pc — matches the
// anchor-clearing order in reset_last_Java_frame).
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));   // clobbers rscratch1
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
6326 
// 64-bit: move arg into the first C argument register (no-op if already there).
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}
6332 
// 64-bit: move arg into the second C argument register (no-op if already there).
static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}
6338 
// 64-bit: move arg into the third C argument register (no-op if already there).
static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}
6344 
// 64-bit: move arg into the fourth C argument register (no-op if already there).
static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}
6350 
// Emit code that halts the VM with a message: the pusha'd register
// block becomes debug64's regs[] argument (passed via rsp in c_rarg2).
void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}
6361 
// Emit code that prints a warning via the C `warning` routine and
// continues; CPU state and the original rsp are preserved.
void MacroAssembler::warn(const char* msg) {
  push(rsp);          // save the unaligned rsp so it can be restored below
  andq(rsp, -16);     // align stack as required by push_CPU_state and call

  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  pop(rsp);
}
6372 
6373 #ifndef PRODUCT
6374 extern "C" void findpc(intptr_t x);
6375 #endif
6376 
// Runtime entry reached from MacroAssembler::stop() on x86_64. regs[]
// points at the register block built by pusha(); the print statements
// below document the index mapping (index 0 = r15, index 15 = rax, i.e.
// the last register pushed sits at the lowest address).
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError ) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr("rax = 0x%016lx", regs[15]);
      tty->print_cr("rbx = 0x%016lx", regs[12]);
      tty->print_cr("rcx = 0x%016lx", regs[14]);
      tty->print_cr("rdx = 0x%016lx", regs[13]);
      tty->print_cr("rdi = 0x%016lx", regs[8]);
      tty->print_cr("rsi = 0x%016lx", regs[9]);
      tty->print_cr("rbp = 0x%016lx", regs[10]);
      tty->print_cr("rsp = 0x%016lx", regs[11]);
      tty->print_cr("r8  = 0x%016lx", regs[7]);
      tty->print_cr("r9  = 0x%016lx", regs[6]);
      tty->print_cr("r10 = 0x%016lx", regs[5]);
      tty->print_cr("r11 = 0x%016lx", regs[4]);
      tty->print_cr("r12 = 0x%016lx", regs[3]);
      tty->print_cr("r13 = 0x%016lx", regs[2]);
      tty->print_cr("r14 = 0x%016lx", regs[1]);
      tty->print_cr("r15 = 0x%016lx", regs[0]);
      BREAKPOINT;
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}
6426 
6427 #endif // _LP64
6428 
6429 // Now versions that are common to 32/64 bit
6430 
6431 void MacroAssembler::addptr(Register dst, int32_t imm32) {
6432   LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
6433 }
6434 
6435 void MacroAssembler::addptr(Register dst, Register src) {
6436   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
6437 }
6438 
6439 void MacroAssembler::addptr(Address dst, Register src) {
6440   LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
6441 }
6442 
6443 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
6444   if (reachable(src)) {
6445     Assembler::addsd(dst, as_Address(src));
6446   } else {
6447     lea(rscratch1, src);
6448     Assembler::addsd(dst, Address(rscratch1, 0));
6449   }
6450 }
6451 
6452 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
6453   if (reachable(src)) {
6454     addss(dst, as_Address(src));
6455   } else {
6456     lea(rscratch1, src);
6457     addss(dst, Address(rscratch1, 0));
6458   }
6459 }
6460 
6461 void MacroAssembler::align(int modulus) {
6462   if (offset() % modulus != 0) {
6463     nop(modulus - (offset() % modulus));
6464   }
6465 }
6466 
6467 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
6468   // Used in sign-masking with aligned address.
6469   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6470   if (reachable(src)) {
6471     Assembler::andpd(dst, as_Address(src));
6472   } else {
6473     lea(rscratch1, src);
6474     Assembler::andpd(dst, Address(rscratch1, 0));
6475   }
6476 }
6477 
6478 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
6479   // Used in sign-masking with aligned address.
6480   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
6481   if (reachable(src)) {
6482     Assembler::andps(dst, as_Address(src));
6483   } else {
6484     lea(rscratch1, src);
6485     Assembler::andps(dst, Address(rscratch1, 0));
6486   }
6487 }
6488 
6489 void MacroAssembler::andptr(Register dst, int32_t imm32) {
6490   LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
6491 }
6492 
// Atomically increment a 32-bit counter in memory.  pushf/popf preserve
// the caller's EFLAGS, which the increment would otherwise clobber; the
// lock prefix is only needed (and only emitted) on MP systems.
void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
  pushf();
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
  popf();
}
6500 
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);  // start banging one page below the current sp
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);  // repeat while size > 0

  // Bang down shadow pages too.
  // The -1 because we already subtracted 1 page.
  for (int i = 0; i< StackShadowPages-1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
6523 
// Emit the biased-locking fast unlock path: if obj_reg's mark word
// carries the biased-lock pattern, the unlock is a no-op and control
// jumps to |done|.  Clobbers temp_reg.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
6538 
// Normalize a C-style boolean in x to exactly 0 or 1.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
6547 
// Wouldn't need if AddressLiteral version had new name
// (plain forwarder so the Label overload stays visible alongside the
// AddressLiteral overload below).
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
6552 
// Indirect call through a register; plain forwarder to the Assembler.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
6556 
6557 void MacroAssembler::call(AddressLiteral entry) {
6558   if (reachable(entry)) {
6559     Assembler::call_literal(entry.target(), entry.rspec());
6560   } else {
6561     lea(rscratch1, entry);
6562     Assembler::call(rscratch1);
6563   }
6564 }
6565 
// Implementation of call_VM versions

// call_VM with 0 Java arguments.  The call(C)/jmp(E) trampoline pushes
// a return address that call_VM_helper uses to recover
// last_Java_sp/last_Java_pc; ret(0) then resumes at E.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
6581 
// call_VM with 1 Java argument (same call/jmp trampoline as above).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
6597 
// call_VM with 2 Java arguments.  Arguments are placed last-to-first so
// an earlier move cannot clobber a yet-unread source register; the
// assert guards the one hazard that ordering cannot fix.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
6618 
// call_VM with 3 Java arguments; same last-to-first argument shuffling
// and anti-clobber asserts as the 2-argument form.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
6644 
// call_VM variant taking an explicit last_java_sp (caller already knows
// the frame anchor); forwards to call_VM_base.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
6653 
// Explicit-last_java_sp call_VM with 1 Java argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6662 
// Explicit-last_java_sp call_VM with 2 Java arguments (last-to-first
// argument shuffling; see the 2-argument trampoline variant).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6675 
// Explicit-last_java_sp call_VM with 3 Java arguments.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6691 
// Like call_VM, but binds directly to MacroAssembler::call_VM_base
// (note the explicit qualification) — presumably to bypass subclass
// overrides; confirm against InterpreterMacroAssembler.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
6700 
// super_call_VM with 1 Java argument.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
6709 
// super_call_VM with 2 Java arguments (last-to-first shuffling).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
6722 
// super_call_VM with 3 Java arguments (last-to-first shuffling).
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
6738 
// Central implementation behind every call_VM variant.  Transitions
// from generated code into the VM: records the last Java frame, calls
// |entry_point| with the JavaThread as implicit first C argument, then
// (optionally) checks for pending exceptions and fetches an oop result
// from the thread-local vm_result slot.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    // Verify the callee-saved assumption: the register must still hold
    // the current thread after the C call.
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      stop("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true, false);

#ifndef CC_INTERP
   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);
#endif /* CC_INTERP */

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach
    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
    movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
    verify_oop(oop_result, "broken oop in call_VM_base");
  }
}
6839 
// Computes last_Java_sp for the call_VM trampoline and forwards to
// call_VM_base with it in rax.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
6862 
// Leaf call into the VM (no Java frame bookkeeping, no exception check).
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
6866 
// Leaf call with 1 argument.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
6871 
// Leaf call with 2 arguments; args shuffled last-to-first so earlier
// moves cannot clobber unread sources (asserts catch the rest).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
6879 
// Leaf call with 3 arguments (last-to-first shuffling).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
6889 
// Leaf call with 1 argument, bound directly to
// MacroAssembler::call_VM_leaf_base (explicit qualification).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
6894 
// super_call_VM_leaf with 2 arguments (last-to-first shuffling).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
6902 
// super_call_VM_leaf with 3 arguments (last-to-first shuffling).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
6912 
// super_call_VM_leaf with 4 arguments (last-to-first shuffling).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
6926 
// Intentionally empty in the base MacroAssembler; presumably overridden
// where early-return handling is needed (e.g. the interpreter) — confirm
// against InterpreterMacroAssembler.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
6929 
// Intentionally empty in the base MacroAssembler; presumably overridden
// where popframe handling is needed (e.g. the interpreter) — confirm
// against InterpreterMacroAssembler.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
6932 
6933 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
6934   if (reachable(src1)) {
6935     cmpl(as_Address(src1), imm);
6936   } else {
6937     lea(rscratch1, src1);
6938     cmpl(Address(rscratch1, 0), imm);
6939   }
6940 }
6941 
6942 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
6943   assert(!src2.is_lval(), "use cmpptr");
6944   if (reachable(src2)) {
6945     cmpl(src1, as_Address(src2));
6946   } else {
6947     lea(rscratch1, src2);
6948     cmpl(src1, Address(rscratch1, 0));
6949   }
6950 }
6951 
// 32-bit compare of a register against an immediate.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
6955 
// 32-bit compare of a register against memory.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
6959 
// Compare two doubles and materialize -1/0/+1 in dst (Java dcmpl/dcmpg
// semantics).  ucomisd sets PF on an unordered compare (NaN operand);
// unordered_is_less selects whether unordered yields -1 or +1.
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered -> keep -1
    jcc(Assembler::below , L);  // opr1 < opr2 -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // opr1 == opr2 -> 0
    increment(dst);             // else opr1 > opr2 -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered -> keep +1
    jcc(Assembler::above , L);  // opr1 > opr2 -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // opr1 == opr2 -> 0
    decrementl(dst);            // else opr1 < opr2 -> -1
  }
  bind(L);
}
6981 
// Compare two floats and materialize -1/0/+1 in dst (Java fcmpl/fcmpg
// semantics).  ucomiss sets PF on an unordered compare (NaN operand);
// unordered_is_less selects whether unordered yields -1 or +1.
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);  // unordered -> keep -1
    jcc(Assembler::below , L);  // opr1 < opr2 -> keep -1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // opr1 == opr2 -> 0
    increment(dst);             // else opr1 > opr2 -> +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);  // unordered -> keep +1
    jcc(Assembler::above , L);  // opr1 > opr2 -> keep +1
    movl(dst, 0);
    jcc(Assembler::equal , L);  // opr1 == opr2 -> 0
    decrementl(dst);            // else opr1 < opr2 -> -1
  }
  bind(L);
}
7003 
7004 
7005 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
7006   if (reachable(src1)) {
7007     cmpb(as_Address(src1), imm);
7008   } else {
7009     lea(rscratch1, src1);
7010     cmpb(Address(rscratch1, 0), imm);
7011   }
7012 }
7013 
// Pointer-sized compare of a register against a literal.  An lval
// compares against the literal's address itself (materialized into
// rscratch1 on 64-bit); otherwise the value at that address is compared.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    // far literal: load its address, then compare through memory
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}
7033 
// Pointer-sized compare of a memory operand against a literal address
// value (lval only — a mem-mem value compare is not encodable).
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
7044 
// lock-prefixed (on MP) compare-and-exchange of reg against the word at
// a literal address; uses rscratch1 when the literal is not reachable.
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
  if (reachable(adr)) {
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    lea(rscratch1, adr);
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, Address(rscratch1, 0));
  }
}
7057 
7058 void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
7059   LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
7060 }
7061 
7062 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
7063   if (reachable(src)) {
7064     Assembler::comisd(dst, as_Address(src));
7065   } else {
7066     lea(rscratch1, src);
7067     Assembler::comisd(dst, Address(rscratch1, 0));
7068   }
7069 }
7070 
7071 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
7072   if (reachable(src)) {
7073     Assembler::comiss(dst, as_Address(src));
7074   } else {
7075     lea(rscratch1, src);
7076     Assembler::comiss(dst, Address(rscratch1, 0));
7077   }
7078 }
7079 
7080 
// Atomically increment a 32-bit counter only when |cond| holds: the
// increment is skipped by branching on the negated condition.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  atomic_incl(counter_addr);
  bind(L);
}
7088 
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case: min_int / -1 would overflow (#DE on idiv),
  // so it is answered without executing idivl at all.
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdql();                       // sign-extend rax into rdx:rax for idiv
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
7124 
7125 
7126 
7127 void MacroAssembler::decrementl(Register reg, int value) {
7128   if (value == min_jint) {subl(reg, value) ; return; }
7129   if (value <  0) { incrementl(reg, -value); return; }
7130   if (value == 0) {                        ; return; }
7131   if (value == 1 && UseIncDec) { decl(reg) ; return; }
7132   /* else */      { subl(reg, value)       ; return; }
7133 }
7134 
7135 void MacroAssembler::decrementl(Address dst, int value) {
7136   if (value == min_jint) {subl(dst, value) ; return; }
7137   if (value <  0) { incrementl(dst, -value); return; }
7138   if (value == 0) {                        ; return; }
7139   if (value == 1 && UseIncDec) { decl(dst) ; return; }
7140   /* else */      { subl(dst, value)       ; return; }
7141 }
7142 
// Signed division of reg by 2^shift_value using an arithmetic shift.
// Negative dividends are biased by (2^shift - 1) first so the shift
// rounds toward zero, matching idiv semantics.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;  // rounding bias for negatives

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
7159 
7160 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
7161   if (reachable(src)) {
7162     Assembler::divsd(dst, as_Address(src));
7163   } else {
7164     lea(rscratch1, src);
7165     Assembler::divsd(dst, Address(rscratch1, 0));
7166   }
7167 }
7168 
7169 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
7170   if (reachable(src)) {
7171     Assembler::divss(dst, as_Address(src));
7172   } else {
7173     lea(rscratch1, src);
7174     Assembler::divss(dst, Address(rscratch1, 0));
7175   }
7176 }
7177 
// The !defined(COMPILER2) term covers core builds, which define neither COMPILER1 nor COMPILER2.
7179 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
7180 void MacroAssembler::empty_FPU_stack() {
7181   if (VM_Version::supports_mmx()) {
7182     emms();
7183   } else {
7184     for (int i = 8; i-- > 0; ) ffree(i);
7185   }
7186 }
7187 #endif // !LP64 || C1 || !C2
7188 
7189 
// Defines obj, preserves var_size_in_bytes
// Inline eden allocation via a CAS retry loop on the shared heap top
// pointer.  Size is either the constant con_size_in_bytes or the
// register var_size_in_bytes (when valid).  Branches to slow_case when
// inline allocation is unsupported or the eden is exhausted.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert(obj == rax, "obj must be in rax, for cmpxchg");
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    jmp(slow_case);
  } else {
    Register end = t1;
    Label retry;
    bind(retry);
    ExternalAddress heap_top((address) Universe::heap()->top_addr());
    movptr(obj, heap_top);  // load current top; this becomes the result
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
    }
    // if end < obj then we wrapped around => object too long => slow case
    cmpptr(end, obj);
    jcc(Assembler::below, slow_case);
    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
    jcc(Assembler::above, slow_case);
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
    // it otherwise. Use lock prefix for atomicity on MPs.
    locked_cmpxchgptr(end, heap_top);
    jcc(Assembler::notEqual, retry);  // lost the race: reload top and retry
  }
}
7223 
// Standard frame prologue: save the caller's frame pointer and start a
// new frame at the current stack pointer.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}
7228 
// A 5 byte nop that is safe for patching (see patch_verified_entry)
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    // Four segment-override prefixes on a one-byte nop make a single
    // 5-byte instruction.
    emit_byte(0x26); // es:
    emit_byte(0x2e); // cs:
    emit_byte(0x64); // fs:
    emit_byte(0x65); // gs:
    emit_byte(0x90);
  }
}
7241 
// x87 compare of ST(0) against ST(1), popping both operands.
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}
7245 
// x87 compare of ST(0) against ST(index), optionally popping one or
// both operands.  tmp (rax) is only needed on CPUs without cmov/fucomi,
// where the FPU status word must be routed to EFLAGS via fnstsw/sahf.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
7281 
// Compare ST0 with ST1, pop both, and materialize -1/0/+1 in dst.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}
7285 
// FPU compare producing a three-way integer result in dst:
// -1 if below, 0 if equal, +1 if above.  An unordered result (NaN)
// maps to -1 or +1 depending on unordered_is_less, which matches the
// semantics of the Java fcmpl/fcmpg bytecodes.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);   // unordered => -1
    jcc(Assembler::below , L);   // less => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal => 0
    increment(dst);              // otherwise greater => +1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);   // unordered => +1
    jcc(Assembler::above , L);   // greater => +1
    movl(dst, 0);
    jcc(Assembler::equal , L);   // equal => 0
    decrementl(dst);             // otherwise less => -1
  }
  bind(L);
}
7306 
// Push the double at an absolute address onto the FPU stack.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}
7310 
// Push the float at an absolute address onto the FPU stack.
void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}
7314 
// Push the 80-bit extended-precision value at an absolute address onto
// the FPU stack.
void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}
7318 
// Load the FPU control word from an absolute address.
void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}
7322 
// Replace X on top of the FPU stack with 2^X.  Splits X into integer and
// fractional parts: 2^frac is computed with f2xm1, and 2^int(X) is built
// by hand-encoding a double with exponent int(X)+1023 in scratch stack
// space.  Exponents outside the representable range produce NaN so the
// caller can fall back to the runtime routine.
void MacroAssembler::pow_exp_core_encoding() {
  // kills rax, rcx, rdx
  subptr(rsp,sizeof(jdouble));
  // computes 2^X. Stack: X ...
  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
  // keep it on the thread's stack to compute 2^int(X) later
  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
  fld_s(0);                 // Stack: X X ...
  frndint();                // Stack: int(X) X ...
  fsuba(1);                 // Stack: int(X) X-int(X) ...
  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
  faddp(1);                 // Stack: 2^(X-int(X))
  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
  // shift int(X)+1023 to exponent position.
  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
  // values so detect them and set result to NaN.
  movl(rax,Address(rsp,0));
  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
  addl(rax, 1023);
  movl(rdx,rax);
  shll(rax,20);   // move biased exponent into the exponent field (bits 52..62)
  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
  addl(rdx,1);
  // Check that 1 < int(X)+1023+1 < 2048
  // in 3 steps:
  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
  // 2- (int(X)+1023+1)&-2048 != 0
  // 3- (int(X)+1023+1)&-2048 != 1
  // Do 2- first because addl just updated the flags.
  cmov32(Assembler::equal,rax,rcx);
  cmpl(rdx,1);
  cmov32(Assembler::equal,rax,rcx);
  testl(rdx,rcx);
  cmov32(Assembler::notEqual,rax,rcx);
  movl(Address(rsp,4),rax);  // high word of the constructed double (sign/exponent)
  movl(Address(rsp,0),0);    // mantissa bits are all zero for an exact power of two
  fmul_d(Address(rsp,0));   // Stack: 2^X ...
  addptr(rsp,sizeof(jdouble));
}
7366 
// Switch the x87 FPU to double-extended precision (PC field = 0x300) for
// the intermediate steps of pow/exp.  Saves the current control word on
// the native stack; restore_precision() pops and reloads it.
void MacroAssembler::increase_precision() {
  subptr(rsp, BytesPerWord);
  fnstcw(Address(rsp, 0));        // save current control word (consumed by restore_precision)
  movl(rax, Address(rsp, 0));
  orl(rax, 0x300);                // set precision-control bits to extended
  push(rax);
  fldcw(Address(rsp, 0));
  pop(rax);
}
7376 
// Undo increase_precision(): reload the control word it saved on the
// native stack and pop the slot.
void MacroAssembler::restore_precision() {
  fldcw(Address(rsp, 0));
  addptr(rsp, BytesPerWord);
}
7381 
// Compute X^Y for X (ST1) and Y (ST0) on the FPU stack, leaving the
// result on top.
void MacroAssembler::fast_pow() {
  // computes X^Y = 2^(Y * log2(X))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fyl2x();                 // Stack: (Y*log2(X)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
}
7392 
// Compute exp(X) for X on top of the FPU stack, leaving the result on top.
void MacroAssembler::fast_exp() {
  // computes exp(X) = 2^(X * log2(e))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fldl2e();                // Stack: log2(e) X ...
  fmulp(1);                // Stack: (X*log2(e)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
}
7404 
// Intrinsic implementation of Math.pow (is_exp == false, args X Y on the
// FPU stack) or Math.exp (is_exp == true, arg X on the FPU stack).  The
// fast path uses fast_pow()/fast_exp(); whenever that yields NaN (or the
// inputs are outside what the fast path handles, e.g. X < 0 with a
// non-integer Y) the computation falls back to the SharedRuntime call.
// Kills rax, rcx, rdx and needs two free FPU stack slots.
void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
  // kills rax, rcx, rdx
  // pow and exp needs 2 extra registers on the fpu stack.
  Label slow_case, done;
  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rdx,
    tmp = rdx;
  }
  Register tmp2 = rax;
  Register tmp3 = rcx;

  if (is_exp) {
    // Stack: X
    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
    fast_exp();                 // Stack: exp(X) X
    fcmp(tmp, 0, false, false); // Stack: exp(X) X
    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate argument. Stack: exp(X)
    if (num_fpu_regs_in_use > 0) {
      fxch();
      fpop();
    } else {
      ffree(1);
    }
    jmp(done);
  } else {
    // Stack: X Y
    Label x_negative, y_odd;

    fldz();                     // Stack: 0 X Y
    fcmp(tmp, 1, true, false);  // Stack: X Y
    jcc(Assembler::above, x_negative);

    // X >= 0

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fast_pow();                 // Stack: X^Y X Y
    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
    // X^Y not equal to itself: X^Y is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }
    jmp(done);

    // X <= 0
    bind(x_negative);

    fld_s(1);                   // Stack: Y X Y
    frndint();                  // Stack: int(Y) X Y
    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
    // int(Y) != Y: Y is not an integer, result would be complex => slow case
    jcc(Assembler::notEqual, slow_case);

    subptr(rsp, 8);

    // For X^Y, when X < 0, Y has to be an integer and the final
    // result depends on whether it's odd or even. We just checked
    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
    // integer to test its parity. If int(Y) is huge and doesn't fit
    // in the 64 bit integer range, the integer indefinite value will
    // end up in the gp registers. Huge numbers are all even, the
    // integer indefinite number is even so it's fine.

#ifdef ASSERT
    // Let's check we don't end up with an integer indefinite number
    // when not expected. First test for huge numbers: check whether
    // int(Y)+1 == int(Y) which is true for very large numbers and
    // those are all even. A 64 bit integer is guaranteed to not
    // overflow for numbers where y+1 != y (when precision is set to
    // double precision).
    Label y_not_huge;

    fld1();                     // Stack: 1 int(Y) X Y
    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y

#ifdef _LP64
    // trip to memory to force the precision down from double extended
    // precision
    fstp_d(Address(rsp, 0));
    fld_d(Address(rsp, 0));
#endif

    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
#endif

    // move int(Y) as 64 bit integer to thread's stack
    fistp_d(Address(rsp,0));    // Stack: X Y

#ifdef ASSERT
    jcc(Assembler::notEqual, y_not_huge);

    // Y is huge so we know it's even. It may not fit in a 64 bit
    // integer and we don't want the debug code below to see the
    // integer indefinite value so overwrite int(Y) on the thread's
    // stack with 0.
    movl(Address(rsp, 0), 0);
    movl(Address(rsp, 4), 0);

    bind(y_not_huge);
#endif

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fabs();                     // Stack: abs(X) Y X Y
    fast_pow();                 // Stack: abs(X)^Y X Y
    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.

    pop(tmp2);                  // low word of int(Y) (full 64 bits on LP64)
    NOT_LP64(pop(tmp3));        // 32-bit: high word of int(Y)
    jcc(Assembler::parity, slow_case);

#ifdef ASSERT
    // Check that int(Y) is not integer indefinite value (int
    // overflow). Shouldn't happen because for values that would
    // overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
    {
      Label integer;
      testl(tmp2, tmp2);
      jcc(Assembler::notZero, integer);
      cmpl(tmp3, 0x80000000);
      jcc(Assembler::notZero, integer);
      stop("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#else
    {
      Label integer;
      mov(tmp3, tmp2); // preserve tmp2 for parity check below
      shlq(tmp3, 1);
      jcc(Assembler::carryClear, integer);
      jcc(Assembler::notZero, integer);
      stop("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#endif
#endif

    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }

    testl(tmp2, 1);
    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
    // X <= 0, Y odd: X^Y = -abs(X)^Y

    fchs();                     // Stack: -abs(X)^Y Y
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  fpop();                       // pop incorrect result or int(Y)

  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
                      is_exp ? 1 : 2, num_fpu_regs_in_use);

  // Come here with result in F-TOS
  bind(done);
}
7580 
// Pop the top of the FPU register stack: mark ST0 free, then advance
// the stack-top pointer.
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}
7585 
// IEEE remainder of ST0 / ST1, leaving the result in ST0 and popping the
// divisor.  fprem only does partial reduction, so loop until the FPU
// signals completion (C2 clear).  rax is clobbered for the status-word
// transfer; tmp preserves its old value (noreg => save on the stack).
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
#ifdef _LP64
    // C2 is bit 0x400 of the status word; loop while reduction incomplete.
    testl(rax, 0x400);
    jcc(Assembler::notEqual, L);
#else
    // sahf maps C2 to PF; loop while reduction incomplete.
    sahf();
    jcc(Assembler::parity, L);
#endif // _LP64
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
7607 
7608 
7609 void MacroAssembler::incrementl(AddressLiteral dst) {
7610   if (reachable(dst)) {
7611     incrementl(as_Address(dst));
7612   } else {
7613     lea(rscratch1, dst);
7614     incrementl(Address(rscratch1, 0));
7615   }
7616 }
7617 
// Increment the 32-bit value at an array element address.
void MacroAssembler::incrementl(ArrayAddress dst) {
  incrementl(as_Address(dst));
}
7621 
7622 void MacroAssembler::incrementl(Register reg, int value) {
7623   if (value == min_jint) {addl(reg, value) ; return; }
7624   if (value <  0) { decrementl(reg, -value); return; }
7625   if (value == 0) {                        ; return; }
7626   if (value == 1 && UseIncDec) { incl(reg) ; return; }
7627   /* else */      { addl(reg, value)       ; return; }
7628 }
7629 
7630 void MacroAssembler::incrementl(Address dst, int value) {
7631   if (value == min_jint) {addl(dst, value) ; return; }
7632   if (value <  0) { decrementl(dst, -value); return; }
7633   if (value == 0) {                        ; return; }
7634   if (value == 1 && UseIncDec) { incl(dst) ; return; }
7635   /* else */      { addl(dst, value)       ; return; }
7636 }
7637 
7638 void MacroAssembler::jump(AddressLiteral dst) {
7639   if (reachable(dst)) {
7640     jmp_literal(dst.target(), dst.rspec());
7641   } else {
7642     lea(rscratch1, dst);
7643     jmp(rscratch1);
7644   }
7645 }
7646 
// Conditional jump to an absolute target.  When the target is reachable
// the jcc is emitted directly (short form if the displacement fits in 8
// bits and needs no relocation); otherwise the branch is reversed around
// an indirect jump through rscratch1.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    // displacement relative to the end of the (yet-to-be-emitted) instruction
    int offs = (intptr_t)dst.target() - ((intptr_t)_code_pos);
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_byte(0x70 | cc);
      emit_byte((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_byte(0x0F);
      emit_byte(0x80 | cc);
      emit_long(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    // jump over the absolute jump when the condition does NOT hold
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
7675 
7676 void MacroAssembler::ldmxcsr(AddressLiteral src) {
7677   if (reachable(src)) {
7678     Assembler::ldmxcsr(as_Address(src));
7679   } else {
7680     lea(rscratch1, src);
7681     Assembler::ldmxcsr(Address(rscratch1, 0));
7682   }
7683 }
7684 
// Load a sign-extended byte from src into dst.  Returns the code offset
// of the load instruction itself (used for implicit null-check mapping).
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    // pre-P6: movsx is slow, so zero-extend then shift to sign-extend
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}
7697 
7698 // Note: load_signed_short used to be called load_signed_word.
7699 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler
7700 // manual, which means 16 bits, that usage is found nowhere in HotSpot code.
7701 // The term "word" in HotSpot means a 32- or 64-bit machine word.
// Load a sign-extended 16-bit value from src into dst.  Returns the code
// offset of the load instruction (used for implicit null-check mapping).
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    // pre-P6: movsx is slow, so zero-extend then shift to sign-extend
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}
7717 
// Load a zero-extended byte from src into dst.  Returns the code offset
// of the load instruction (used for implicit null-check mapping).
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    // pre-P6: clear dst first, then do a partial-register byte load
    // (only safe when src does not use dst)
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}
7732 
7733 // Note: load_unsigned_short used to be called load_unsigned_word.
// Load a zero-extended 16-bit value from src into dst.  Returns the code
// offset of the load instruction (used for implicit null-check mapping).
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    // pre-P6: clear dst first, then do a partial-register word load
    // (only safe when src does not use dst)
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
7748 
// Load a value of size_in_bytes (1, 2, 4 or 8) from src into dst,
// sign- or zero-extending sub-word sizes per is_signed.  On 32-bit VMs
// an 8-byte load needs dst2 for the high half.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default:  ShouldNotReachHere();
  }
}
7766 
// Store a value of size_in_bytes (1, 2, 4 or 8) from src to dst.  On
// 32-bit VMs an 8-byte store needs src2 for the high half.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case  8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case  8:  movq(dst, src); break;
#endif
  case  4:  movl(dst, src); break;
  case  2:  movw(dst, src); break;
  case  1:  movb(dst, src); break;
  default:  ShouldNotReachHere();
  }
}
7784 
7785 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
7786   if (reachable(dst)) {
7787     movl(as_Address(dst), src);
7788   } else {
7789     lea(rscratch1, dst);
7790     movl(Address(rscratch1, 0), src);
7791   }
7792 }
7793 
7794 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
7795   if (reachable(src)) {
7796     movl(dst, as_Address(src));
7797   } else {
7798     lea(rscratch1, src);
7799     movl(dst, Address(rscratch1, 0));
7800   }
7801 }
7802 
7803 // C++ bool manipulation
7804 
7805 void MacroAssembler::movbool(Register dst, Address src) {
7806   if(sizeof(bool) == 1)
7807     movb(dst, src);
7808   else if(sizeof(bool) == 2)
7809     movw(dst, src);
7810   else if(sizeof(bool) == 4)
7811     movl(dst, src);
7812   else
7813     // unsupported
7814     ShouldNotReachHere();
7815 }
7816 
7817 void MacroAssembler::movbool(Address dst, bool boolconst) {
7818   if(sizeof(bool) == 1)
7819     movb(dst, (int) boolconst);
7820   else if(sizeof(bool) == 2)
7821     movw(dst, (int) boolconst);
7822   else if(sizeof(bool) == 4)
7823     movl(dst, (int) boolconst);
7824   else
7825     // unsupported
7826     ShouldNotReachHere();
7827 }
7828 
7829 void MacroAssembler::movbool(Address dst, Register src) {
7830   if(sizeof(bool) == 1)
7831     movb(dst, src);
7832   else if(sizeof(bool) == 2)
7833     movw(dst, src);
7834   else if(sizeof(bool) == 4)
7835     movl(dst, src);
7836   else
7837     // unsupported
7838     ShouldNotReachHere();
7839 }
7840 
// Store a byte constant to an array element address.
void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}
7844 
7845 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
7846   if (reachable(src)) {
7847     movdl(dst, as_Address(src));
7848   } else {
7849     lea(rscratch1, src);
7850     movdl(dst, Address(rscratch1, 0));
7851   }
7852 }
7853 
7854 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
7855   if (reachable(src)) {
7856     movq(dst, as_Address(src));
7857   } else {
7858     lea(rscratch1, src);
7859     movq(dst, Address(rscratch1, 0));
7860   }
7861 }
7862 
7863 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
7864   if (reachable(src)) {
7865     if (UseXmmLoadAndClearUpper) {
7866       movsd (dst, as_Address(src));
7867     } else {
7868       movlpd(dst, as_Address(src));
7869     }
7870   } else {
7871     lea(rscratch1, src);
7872     if (UseXmmLoadAndClearUpper) {
7873       movsd (dst, Address(rscratch1, 0));
7874     } else {
7875       movlpd(dst, Address(rscratch1, 0));
7876     }
7877   }
7878 }
7879 
7880 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
7881   if (reachable(src)) {
7882     movss(dst, as_Address(src));
7883   } else {
7884     lea(rscratch1, src);
7885     movss(dst, Address(rscratch1, 0));
7886   }
7887 }
7888 
// Pointer-width register-to-register move (movq on LP64, movl on 32-bit).
void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
7892 
// Pointer-width load from memory (movq on LP64, movl on 32-bit).
void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
7896 
7897 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
7898 void MacroAssembler::movptr(Register dst, intptr_t src) {
7899   LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
7900 }
7901 
// Pointer-width store to memory (movq on LP64, movl on 32-bit).
void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}
7905 
7906 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
7907   if (reachable(src)) {
7908     Assembler::movsd(dst, as_Address(src));
7909   } else {
7910     lea(rscratch1, src);
7911     Assembler::movsd(dst, Address(rscratch1, 0));
7912   }
7913 }
7914 
7915 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
7916   if (reachable(src)) {
7917     Assembler::movss(dst, as_Address(src));
7918   } else {
7919     lea(rscratch1, src);
7920     Assembler::movss(dst, Address(rscratch1, 0));
7921   }
7922 }
7923 
7924 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
7925   if (reachable(src)) {
7926     Assembler::mulsd(dst, as_Address(src));
7927   } else {
7928     lea(rscratch1, src);
7929     Assembler::mulsd(dst, Address(rscratch1, 0));
7930   }
7931 }
7932 
7933 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
7934   if (reachable(src)) {
7935     Assembler::mulss(dst, as_Address(src));
7936   } else {
7937     lea(rscratch1, src);
7938     Assembler::mulss(dst, Address(rscratch1, 0));
7939   }
7940 }
7941 
// Explicit null check: when an access at the given offset could not rely
// on the OS trapping a NULL base (offset too large for the protected
// page), touch M[reg] now so a NULL reg faults here.  Otherwise the
// later access itself serves as the implicit null check.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}
7956 
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
7962 
// Restore FPU then integer state, mirroring push_CPU_state's
// push_IU_state / push_FPU_state order.
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}
7967 
// Restore the FPU/SSE state saved by push_FPU_state and release the
// stack area (frstor matches fnsave on 32-bit, fxrstor matches fxsave
// on 64-bit).
void MacroAssembler::pop_FPU_state() {
  NOT_LP64(frstor(Address(rsp, 0));)
  LP64_ONLY(fxrstor(Address(rsp, 0));)
  addptr(rsp, FPUStateSizeInWords * wordSize);
}
7973 
// Restore the integer registers and flags saved by push_IU_state
// (including the 8-byte alignment pad inserted on 64-bit).
void MacroAssembler::pop_IU_state() {
  popa();
  LP64_ONLY(addq(rsp, 8));  // drop the alignment pad pushed by push_IU_state
  popf();
}
7979 
7980 // Save Integer and Float state
7981 // Warning: Stack must be 16 byte aligned (64bit)
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}
7986 
// Save the full FPU/SSE state into freshly reserved stack space
// (fnsave on 32-bit, fxsave on 64-bit); undone by pop_FPU_state.
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // LP64
}
7996 
// Save flags and all integer registers; undone by pop_IU_state.
void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  LP64_ONLY(subq(rsp, 8));
  pusha();
}
8004 
// Clear the JavaThread's last-Java-frame anchor after returning from a
// runtime call.  Clearing last_Java_sp is mandatory (it marks the anchor
// invalid for stack walkers); fp and pc are cleared on request.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc)
    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);

}
8021 
// Counterpart of save_rax: restore rax from the stack (tmp == noreg) or
// from tmp; no-op when tmp is rax itself.
void MacroAssembler::restore_rax(Register tmp) {
  if (tmp == noreg) pop(rax);
  else if (tmp != rax) mov(rax, tmp);
}
8026 
// Round reg up to the next multiple of modulus (which must be a power
// of two for the mask to be valid).
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
8031 
// Preserve rax across a sequence that clobbers it: spill to the stack
// (tmp == noreg) or copy into tmp; no-op when tmp is rax itself.
// Paired with restore_rax.
void MacroAssembler::save_rax(Register tmp) {
  if (tmp == noreg) push(rax);
  else if (tmp != rax) mov(tmp, rax);
}
8036 
8037 // Write serialization page so VM thread can do a pseudo remote membar.
8038 // We use the current thread pointer to calculate a thread specific
8039 // offset to write to within the page. This minimizes bus traffic
8040 // due to cache line collision.
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  // derive a per-thread, int-aligned offset into the serialization page
  movl(tmp, thread);
  shrl(tmp, os::get_serialize_page_shift_count());
  andl(tmp, (os::vm_page_size() - sizeof(int)));

  Address index(noreg, tmp, Address::times_1);
  ExternalAddress page(os::get_memory_serialize_page());

  // Size of store must match masking code above
  movl(as_Address(ArrayAddress(page, index)), tmp);
}
8052 
8053 // Calls to C land
8054 //
8055 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded
8056 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
8057 // has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, optionally fp and pc) in the thread's
// frame anchor before calling into C land.  last_Java_sp is stored last:
// a non-zero sp is what marks the anchor as valid to stack walkers.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
8088 
// Pointer-width left shift (shlq on LP64, shll on 32-bit).
void MacroAssembler::shlptr(Register dst, int imm8) {
  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
}
8092 
// Pointer-width logical right shift (shrq on LP64, shrl on 32-bit).
void MacroAssembler::shrptr(Register dst, int imm8) {
  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
}
8096 
// Sign-extend the low byte of reg in place.  movsx needs a byte-
// addressable register and a P6+ CPU; otherwise shift up and
// arithmetic-shift back down.
void MacroAssembler::sign_extend_byte(Register reg) {
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
    movsbl(reg, reg); // movsxb
  } else {
    shll(reg, 24);
    sarl(reg, 24);
  }
}
8105 
// Sign-extend the low 16 bits of reg in place (movsx on P6+/64-bit,
// shift pair otherwise).
void MacroAssembler::sign_extend_short(Register reg) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    movswl(reg, reg); // movsxw
  } else {
    shll(reg, 16);
    sarl(reg, 16);
  }
}
8114 
// 32-bit test against a memory operand at an absolute address.  Unlike
// the other AddressLiteral wrappers there is no rscratch1 fallback here,
// so the address must be directly reachable.
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
8119 
8120 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
8121   if (reachable(src)) {
8122     Assembler::sqrtsd(dst, as_Address(src));
8123   } else {
8124     lea(rscratch1, src);
8125     Assembler::sqrtsd(dst, Address(rscratch1, 0));
8126   }
8127 }
8128 
8129 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
8130   if (reachable(src)) {
8131     Assembler::sqrtss(dst, as_Address(src));
8132   } else {
8133     lea(rscratch1, src);
8134     Assembler::sqrtss(dst, Address(rscratch1, 0));
8135   }
8136 }
8137 
8138 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
8139   if (reachable(src)) {
8140     Assembler::subsd(dst, as_Address(src));
8141   } else {
8142     lea(rscratch1, src);
8143     Assembler::subsd(dst, Address(rscratch1, 0));
8144   }
8145 }
8146 
8147 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
8148   if (reachable(src)) {
8149     Assembler::subss(dst, as_Address(src));
8150   } else {
8151     lea(rscratch1, src);
8152     Assembler::subss(dst, Address(rscratch1, 0));
8153   }
8154 }
8155 
8156 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
8157   if (reachable(src)) {
8158     Assembler::ucomisd(dst, as_Address(src));
8159   } else {
8160     lea(rscratch1, src);
8161     Assembler::ucomisd(dst, Address(rscratch1, 0));
8162   }
8163 }
8164 
8165 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
8166   if (reachable(src)) {
8167     Assembler::ucomiss(dst, as_Address(src));
8168   } else {
8169     lea(rscratch1, src);
8170     Assembler::ucomiss(dst, Address(rscratch1, 0));
8171   }
8172 }
8173 
8174 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
8175   // Used in sign-bit flipping with aligned address.
8176   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
8177   if (reachable(src)) {
8178     Assembler::xorpd(dst, as_Address(src));
8179   } else {
8180     lea(rscratch1, src);
8181     Assembler::xorpd(dst, Address(rscratch1, 0));
8182   }
8183 }
8184 
8185 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
8186   // Used in sign-bit flipping with aligned address.
8187   assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
8188   if (reachable(src)) {
8189     Assembler::xorps(dst, as_Address(src));
8190   } else {
8191     lea(rscratch1, src);
8192     Assembler::xorps(dst, Address(rscratch1, 0));
8193   }
8194 }
8195 
8196 // AVX 3-operands instructions
8197 
8198 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8199   if (reachable(src)) {
8200     vaddsd(dst, nds, as_Address(src));
8201   } else {
8202     lea(rscratch1, src);
8203     vaddsd(dst, nds, Address(rscratch1, 0));
8204   }
8205 }
8206 
8207 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8208   if (reachable(src)) {
8209     vaddss(dst, nds, as_Address(src));
8210   } else {
8211     lea(rscratch1, src);
8212     vaddss(dst, nds, Address(rscratch1, 0));
8213   }
8214 }
8215 
8216 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8217   if (reachable(src)) {
8218     vandpd(dst, nds, as_Address(src), vector256);
8219   } else {
8220     lea(rscratch1, src);
8221     vandpd(dst, nds, Address(rscratch1, 0), vector256);
8222   }
8223 }
8224 
8225 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8226   if (reachable(src)) {
8227     vandps(dst, nds, as_Address(src), vector256);
8228   } else {
8229     lea(rscratch1, src);
8230     vandps(dst, nds, Address(rscratch1, 0), vector256);
8231   }
8232 }
8233 
8234 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8235   if (reachable(src)) {
8236     vdivsd(dst, nds, as_Address(src));
8237   } else {
8238     lea(rscratch1, src);
8239     vdivsd(dst, nds, Address(rscratch1, 0));
8240   }
8241 }
8242 
8243 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8244   if (reachable(src)) {
8245     vdivss(dst, nds, as_Address(src));
8246   } else {
8247     lea(rscratch1, src);
8248     vdivss(dst, nds, Address(rscratch1, 0));
8249   }
8250 }
8251 
8252 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8253   if (reachable(src)) {
8254     vmulsd(dst, nds, as_Address(src));
8255   } else {
8256     lea(rscratch1, src);
8257     vmulsd(dst, nds, Address(rscratch1, 0));
8258   }
8259 }
8260 
8261 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8262   if (reachable(src)) {
8263     vmulss(dst, nds, as_Address(src));
8264   } else {
8265     lea(rscratch1, src);
8266     vmulss(dst, nds, Address(rscratch1, 0));
8267   }
8268 }
8269 
8270 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8271   if (reachable(src)) {
8272     vsubsd(dst, nds, as_Address(src));
8273   } else {
8274     lea(rscratch1, src);
8275     vsubsd(dst, nds, Address(rscratch1, 0));
8276   }
8277 }
8278 
8279 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
8280   if (reachable(src)) {
8281     vsubss(dst, nds, as_Address(src));
8282   } else {
8283     lea(rscratch1, src);
8284     vsubss(dst, nds, Address(rscratch1, 0));
8285   }
8286 }
8287 
8288 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8289   if (reachable(src)) {
8290     vxorpd(dst, nds, as_Address(src), vector256);
8291   } else {
8292     lea(rscratch1, src);
8293     vxorpd(dst, nds, Address(rscratch1, 0), vector256);
8294   }
8295 }
8296 
8297 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
8298   if (reachable(src)) {
8299     vxorps(dst, nds, as_Address(src), vector256);
8300   } else {
8301     lea(rscratch1, src);
8302     vxorps(dst, nds, Address(rscratch1, 0), vector256);
8303   }
8304 }
8305 
8306 
8307 //////////////////////////////////////////////////////////////////////////////////
8308 #ifndef SERIALGC
8309 
// G1 SATB (snapshot-at-the-beginning) pre-write barrier.  If concurrent
// marking is active, record the value about to be overwritten (the
// "previous value") in the thread-local SATB mark queue, calling into
// the runtime when the queue is full.
//
// obj         - address of the field being written, or noreg if the caller
//               has already loaded the previous value into pre_val
// pre_val     - receives (or already holds) the previous value; must not be
//               noreg, and must not be rax when obj is supplied
// thread      - the current JavaThread (must be r15_thread on LP64)
// tmp         - scratch register, clobbered
// tosca_live  - true if rax holds a live value that must be preserved
//               across the runtime call
// expand_call - expand the leaf call inline (see comment below)
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg) {
    assert_different_registers(obj, pre_val, tmp);
    assert(pre_val != rax, "check this code");
  }

  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active?  If not, the barrier is a no-op.
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?  Null previous values need no recording.
  cmpptr(pre_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t; it counts down to 0.)

  movptr(tmp, index);                   // tmp := *index_adr
  cmpptr(tmp, 0);                       // tmp == 0?
  jcc(Assembler::equal, runtime);       // If yes, buffer is full: goto runtime

  subptr(tmp, wordSize);                // tmp := tmp - wordSize
  movptr(index, tmp);                   // *index_adr := tmp
  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr

  // Record the previous value in the next free buffer slot
  movptr(Address(tmp, 0), pre_val);
  jmp(done);

  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);

  if (obj != noreg && obj != rax)
    push(obj);

  if (pre_val != rax)
    push(pre_val);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have a full interpreter frame on the stack
  // expand_call should be passed true.

  NOT_LP64( push(thread); )

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  NOT_LP64( pop(thread); )

  // restore the live input values
  if (pre_val != rax)
    pop(pre_val);

  if (obj != noreg && obj != rax)
    pop(obj);

  if(tosca_live) pop(rax);

  bind(done);
}
8423 
// G1 post-write barrier.  After storing new_val into *store_addr: if the
// store crosses heap regions, the new value is non-NULL and the card for
// store_addr is not already dirty, dirty the card and push its address
// onto the thread-local dirty card queue, calling into the runtime when
// the queue is full.  Clobbers tmp and tmp2 (and rscratch1 on LP64).
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  Label done;
  Label runtime;

  // Does store cross heap regions?  The xor has a non-zero bit above the
  // region-grain shift iff the two addresses lie in different regions.

  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?  NULL stores need no remembering.

  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?

  ExternalAddress cardtable((address) ct->byte_map_base);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
#ifdef _LP64
  const Register card_addr = tmp;

  movq(card_addr, store_addr);
  shrq(card_addr, CardTableModRefBS::card_shift);

  lea(tmp2, cardtable);

  // get the address of the card
  addq(card_addr, tmp2);
#else
  // On 32-bit the card table base may not fit a displacement, so form
  // the card address through an ArrayAddress.
  const Register card_index = tmp;

  movl(card_index, store_addr);
  shrl(card_index, CardTableModRefBS::card_shift);

  Address index(noreg, card_index, Address::times_1);
  const Register card_addr = tmp;
  lea(card_addr, as_Address(ArrayAddress(cardtable, index)));
#endif
  cmpb(Address(card_addr, 0), 0);
  jcc(Assembler::equal, done);

  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  movb(Address(card_addr, 0), 0);

  // Enqueue the card address; the queue index counts down to 0 (full).
  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  movl(Address(tmp2, 0), card_index);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);
}
8517 
8518 #endif // SERIALGC
8519 //////////////////////////////////////////////////////////////////////////////////
8520 
8521 
// Card-table store check: dirty the card covering the address in obj.
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  // part_1 computes the card index, part_2 dirties the card; split so
  // other instructions can be scheduled in between.
  store_check_part_1(obj);
  store_check_part_2(obj);
}
8528 
// Two-argument form for callers that also have the destination Address;
// dst is not needed by the card-table scheme and is ignored.
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}
8532 
8533 
// split the store check operation so that other instructions can be scheduled inbetween
// Part 1: convert the address in obj to a card index (obj is destroyed).
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  shrptr(obj, CardTableModRefBS::card_shift);
}
8540 
// Part 2: dirty the card table entry indexed by obj (as computed by
// store_check_part_1), i.e. byte_map_base[card_index] = 0.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and
  // it will never need to be relocated. On 64bit however the value may be too
  // large for a 32bit displacement

  intptr_t disp = (intptr_t) ct->byte_map_base;
  if (is_simm32(disp)) {
    // Base fits in a 32-bit displacement: a single movb suffices.
    Address cardtable(noreg, obj, Address::times_1, disp);
    movb(cardtable, 0);
  } else {
    // By doing it as an ExternalAddress disp could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and
    // a smarter version of as_Address. Worst case it is two instructions which
    // is no worse off than loading disp into a register and doing as a simple
    // Address() as above.
    // We can't do as ExternalAddress as the only style since if disp == 0 we'll
    // assert since NULL isn't acceptable in a reloci (see 6644928). In any case
    // in some cases we'll get a single instruction version.

    ExternalAddress cardtable((address)disp);
    Address index(noreg, obj, Address::times_1);
    movb(as_Address(ArrayAddress(cardtable, index)), 0);
  }
}
8572 
// Pointer-sized subtract of an immediate: dst -= imm32.
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}
8576 
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (needed when the instruction may be patched later).
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
}
8581 
// Pointer-sized register subtract: dst -= src.
void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}
8585 
8586 // C++ bool manipulation
8587 void MacroAssembler::testbool(Register dst) {
8588   if(sizeof(bool) == 1)
8589     testb(dst, 0xff);
8590   else if(sizeof(bool) == 2) {
8591     // testw implementation needed for two byte bools
8592     ShouldNotReachHere();
8593   } else if(sizeof(bool) == 4)
8594     testl(dst, dst);
8595   else
8596     // unsupported
8597     ShouldNotReachHere();
8598 }
8599 
// Pointer-sized test: set flags from dst & src.
void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}
8603 
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Bump-pointer allocation in the current thread's TLAB:
//   obj               - receives the address of the allocated object
//   var_size_in_bytes - register holding the size, or noreg to use
//                       con_size_in_bytes instead
//   con_size_in_bytes - constant allocation size (used iff var_size is noreg)
//   t1                - scratch (holds the thread pointer on 32-bit)
//   t2                - scratch for the new top
//   slow_case         - branched to if the TLAB has insufficient space
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  assert_different_registers(obj, t1, t2);
  assert_different_registers(obj, var_size_in_bytes, t1);
  Register end = t2;
  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);

  verify_tlab();

  NOT_LP64(get_thread(thread));

  // obj := current TLAB top; end := obj + size
  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
  if (var_size_in_bytes == noreg) {
    lea(end, Address(obj, con_size_in_bytes));
  } else {
    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
  }
  // Overflow past the TLAB end means we must take the slow path.
  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
  jcc(Assembler::above, slow_case);

  // update the tlab top pointer
  movptr(Address(thread, JavaThread::tlab_top_offset()), end);

  // recover var_size_in_bytes if necessary (caller allowed t2 aliasing)
  if (var_size_in_bytes == end) {
    subptr(var_size_in_bytes, obj);
  }
  verify_tlab();
}
8638 
// Preserves rbx, and rdx.
// Refill the current thread's TLAB from eden, or branch to try_eden /
// slow_case when an inline refill is not possible or not profitable.
// The remainder of a discarded TLAB is filled with a dummy int array so
// the heap stays parseable for concurrent GCs.  On success jumps back
// to retry.  Returns the register that holds the thread pointer, for
// use by the caller.  Clobbers rax, rcx, rsi (and rdi on 32-bit).
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = rax;
  Register t1  = rcx;
  Register t2  = rsi;
  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    jmp(slow_case);
  }

  NOT_LP64(get_thread(thread_reg));

  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space (in heap words)
  subptr(t1, top);
  shrptr(t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.
  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  jcc(Assembler::lessEqual, discard_tlab);

  // Retain
  // %%% yuck as movptr...
  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
  if (TLABStats) {
    // increment number of slow_allocations
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
  }
  jmp(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
    // accumulate wastage -- t1 is amount free in tlab
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  testptr(top, top);
  jcc(Assembler::zero, do_refill);

  // set up the mark word of the filler array
  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  // set the length to the remaining space (converted to jint elements)
  subptr(t1, typeArrayOopDesc::header_size(T_INT));
  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
  // set klass to intArrayKlass
  // dubious reloc why not an oop reloc?
  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // credit the filler's size to the thread's allocated-bytes counter
  movptr(t1, top);
  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
  incr_allocated_bytes(thread_reg, t1, 0);

  // refill the tlab with an eden allocation
  bind(do_refill);
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
  shlptr(t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = rsi;
    assert_different_registers(tsize, thread_reg, t1);
    push(tsize);
    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
    shlptr(tsize, LogHeapWordSize);
    cmpptr(t1, tsize);
    jcc(Assembler::equal, ok);
    stop("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    pop(tsize);
  }
#endif
  // install the new TLAB: start = top(new), end = top + size - reserve
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
  addptr(top, t1);
  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
  verify_tlab();
  jmp(retry);

  return thread_reg; // for use by caller
}
8745 
// Add an allocation to the thread's cumulative allocated-bytes counter.
// Uses var_size_in_bytes when valid, else con_size_in_bytes.  If thread
// is not valid it is obtained (r15_thread on LP64, loaded into t1 on
// 32-bit).  The counter is 64 bits wide, so 32-bit uses an add/adc pair.
void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  if (!thread->is_valid()) {
#ifdef _LP64
    thread = r15_thread;
#else
    assert(t1->is_valid(), "need temp reg");
    thread = t1;
    get_thread(thread);
#endif
  }

#ifdef _LP64
  if (var_size_in_bytes->is_valid()) {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
#else
  if (var_size_in_bytes->is_valid()) {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
  // propagate the carry into the high 32 bits of the 64-bit counter
  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
8775 
// Call a leaf runtime routine (e.g. SharedRuntime::dsin) taking nb_args
// double arguments from the x87 stack (argument in F-TOS), preserving
// all GP registers, live SSE registers and any additional live x87
// stack slots (num_fpu_regs_in_use, which includes the arguments).
// The result is left in F-TOS.
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  pusha();

  // if we are coming from c1, xmm registers may be live
  if (UseSSE >= 1) {
    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
  }
  int off = 0;
  if (UseSSE == 1)  {
    // UseSSE==1: only float (single) state is live in xmm0-7
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
  } else if (UseSSE >= 2)  {
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
#ifdef _LP64
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
#endif
  }

  // Preserve registers across runtime call
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument(s) to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin, dcos etc.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    // reload the arguments (they were pushed last, so sit deepest)
    for (int i = nb_args-1; i >= 0; i--) {
      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
    }
  }

  // spill the nb_args arguments from the x87 stack to the C stack
  subptr(rsp, nb_args*sizeof(jdouble));
  for (int i = 0; i < nb_args; i++) {
    fstp_d(Address(rsp, i*sizeof(jdouble)));
  }

#ifdef _LP64
  // 64-bit ABI passes floating point arguments in xmm0/xmm1
  if (nb_args > 0) {
    movdbl(xmm0, Address(rsp, 0));
  }
  if (nb_args > 1) {
    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
  }
  assert(nb_args <= 2, "unsupported number of args");
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  NEEDS_CLEANUP;
  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level

  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);

#ifdef _LP64
  // move the xmm0 result back to F-TOS to match the 32-bit convention
  movsd(Address(rsp, 0), xmm0);
  fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble) * nb_args);
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU
    // stack except incoming arguments
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble) * nb_args);
  }

  // restore the saved SSE state
  off = 0;
  if (UseSSE == 1)  {
    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
  } else if (UseSSE >= 2)  {
    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
#ifdef _LP64
    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
#endif
  }
  if (UseSSE >= 1) {
    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
  }
  popa();
}
8913 
// pi/4 constant used by trigfunc() for the fast-path argument range check.
static const double     pi_4 =  0.7853981633974483;
8915 
// Compute sin ('s'), cos ('c') or tan ('t') of the double in F-TOS,
// leaving the result in F-TOS.  Uses the fsin/fcos/ftan instructions
// inline when |x| <= pi/4; otherwise falls back to the SharedRuntime
// implementation via fp_runtime_fallback.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary, so preserve rbx
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    fcmp(tmp);
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }
  // if pi4_adr is not reachable, we fall straight through to the slow case

  // slow case: runtime call
  bind(slow_case);

  switch(trig) {
  case 's':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
    }
    break;
  case 'c':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
    }
    break;
  case 't':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }

  // Come here with result in F-TOS
  bind(done);

  if (tmp != noreg) {
    pop(tmp);
  }
}
8990 
8991 
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
// Clobbers recv_klass and scan_temp.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = instanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, instanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The loop is peeled once (peel == 1 is the first iteration) so the
  // common case of a hit on the first itable entry branches only once.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}
9065 
9066 
// Generic subtype check: jumps to L_success if sub_klass is a subtype of
// super_klass, and falls through otherwise.  Combines the fast path
// (supertype display / cache probe) with the slow path (linear scan of the
// secondary supers array).  temp_reg is clobbered.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  // NULL slow-path label => fast path falls through into the slow path call.
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  // NULL failure label => slow path falls through to L_failure below.
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
9076 
9077 
// Fast part of the subtype check: probes the supertype display and the
// secondary-super cache.  Branches to L_success / L_failure when the answer
// is decided here, and to L_slow_path when a scan of the secondary supers
// (check_klass_subtype_slow_path) is still required.  A NULL label means
// "fall through" for that outcome; at most one of the three may be NULL.
// temp_reg is clobbered only when super_check_offset must be loaded.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // -1 means the caller did not supply an offset; load it from super_klass.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // Replace any NULL outcome label with the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Runtime-variable offset: may have hit either the display or the cache.
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
9179 
9180 
// Slow part of the subtype check: linearly scans sub_klass's secondary
// supers array for super_klass using repne scas, and caches super_klass in
// the secondary-super cache on a hit.  Branches to L_success / L_failure
// (NULL means fall through; at most one may be NULL).  temp_reg/temp2_reg,
// when supplied, let us avoid spilling rax/rcx/rdi around the scan.  With
// set_cond_codes, ZF is left set on success and clear on failure.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  // Replace any NULL outcome label with the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  // With compressed oops the copy is needed even when super_klass == rax,
  // because encode_heap_oop_not_null below clobbers rax.
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Bump the partial-subtype counter (via rcx on LP64, directly on 32-bit).
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
#ifdef _LP64
  // This part is tricky, as values in supers array could be 32 or 64 bit wide
  // and we store values in objArrays always encoded, thus we need to encode
  // the value of rax before repne.  Note that rax is dead after the repne.
  if (UseCompressedOops) {
    encode_heap_oop_not_null(rax); // Changes flags.
    // The superclass is never null; it would be a basic system error if a null
    // pointer were to sneak in here.  Note that we have already loaded the
    // Klass::super_check_offset from the super_klass in the fast path,
    // so if there is a null in that register, we are already in the afterlife.
    testl(rax,rax); // Set Z = 0
    repne_scanl();
  } else
#endif // _LP64
  {
    testptr(rax,rax); // Set Z = 0
    repne_scan();
  }
  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  // Short branch suffices when the target is the nearby fall-through label.
  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
9286 
9287 
9288 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
9289   if (VM_Version::supports_cmov()) {
9290     cmovl(cc, dst, src);
9291   } else {
9292     Label L;
9293     jccb(negate_condition(cc), L);
9294     movl(dst, src);
9295     bind(L);
9296   }
9297 }
9298 
9299 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
9300   if (VM_Version::supports_cmov()) {
9301     cmovl(cc, dst, src);
9302   } else {
9303     Label L;
9304     jccb(negate_condition(cc), L);
9305     movl(dst, src);
9306     bind(L);
9307   }
9308 }
9309 
// Verify that reg holds a valid oop (no-op unless -XX:+VerifyOops).
// Pushes the register and a message string and calls the shared
// verify_oop_subroutine stub; the stub pops the arguments and restores
// rax (and r10 on LP64), so this emits no register-clobbering code.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  // NOTE: the buffer is intentionally never freed -- the generated code
  // embeds its address (via ExternalAddress below), so it must outlive
  // this assembler.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop: %s: %s", reg->name(), s);
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
}
9331 
9332 
// Materialize a "delayed value" (a value computed lazily, after some code
// has already been generated) as a RegisterOrConstant.  If the value is
// already known at assembly time, return it (plus offset) as a constant
// and emit no code; otherwise emit a run-time indirect load into tmp and
// return tmp.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // Debug check: by the time the generated code runs, the delayed value
  // must have been filled in (non-zero).
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      jcc(Assembler::notZero, L);
      // NOTE: buf is intentionally leaked; the stop() message refers to it
      // after this method returns.
      char* buf = new char[40];
      sprintf(buf, "DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
      stop(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}
9364 
9365 
// registers on entry:
//  - rax ('check' register): required MethodType
//  - rcx: method handle
//  - rdx, rsi, or ?: killable temp
// Compares the method handle's type field against mtype_reg and jumps to
// wrong_method_type on mismatch.  The field offset is a delayed value,
// resolved via temp_reg if not yet known.
void MacroAssembler::check_method_handle_type(Register mtype_reg, Register mh_reg,
                                              Register temp_reg,
                                              Label& wrong_method_type) {
  Address type_addr(mh_reg, delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg));
  // compare method type against that of the receiver
  if (UseCompressedOops) {
    // Narrow oop in memory: load it into temp_reg first (presumably decoded
    // by load_heap_oop), then compare full pointers.
    load_heap_oop(temp_reg, type_addr);
    cmpptr(mtype_reg, temp_reg);
  } else {
    cmpptr(mtype_reg, type_addr);
  }
  jcc(Assembler::notEqual, wrong_method_type);
}
9383 
9384 
// A method handle has a "vmslots" field which gives the size of its
// argument list in JVM stack slots.  This field is either located directly
// in every method handle, or else is indirectly accessed through the
// method handle's MethodType.  This macro hides the distinction.
// Note: vmslots_reg doubles as an intermediate register for the chain of
// loads, so it must differ from both mh_reg and temp_reg.
void MacroAssembler::load_method_handle_vmslots(Register vmslots_reg, Register mh_reg,
                                                Register temp_reg) {
  assert_different_registers(vmslots_reg, mh_reg, temp_reg);
  // load mh.type.form.vmslots
  Register temp2_reg = vmslots_reg;
  load_heap_oop(temp2_reg, Address(mh_reg,    delayed_value(java_lang_invoke_MethodHandle::type_offset_in_bytes, temp_reg)));
  load_heap_oop(temp2_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodType::form_offset_in_bytes, temp_reg)));
  movl(vmslots_reg, Address(temp2_reg, delayed_value(java_lang_invoke_MethodTypeForm::vmslots_offset_in_bytes, temp_reg)));
}
9398 
9399 
// registers on entry:
//  - rcx: method handle
//  - rdx: killable temp (interpreted only)
//  - rax: killable temp (compiled only)
// Tail-jumps to the method handle's interpreted entry point; control does
// not return here.
void MacroAssembler::jump_to_method_handle_entry(Register mh_reg, Register temp_reg) {
  assert(mh_reg == rcx, "caller must put MH object in rcx");
  assert_different_registers(mh_reg, temp_reg);

  // pick out the interpreted side of the handler
  // NOTE: vmentry is not an oop!
  movptr(temp_reg, Address(mh_reg, delayed_value(java_lang_invoke_MethodHandle::vmentry_offset_in_bytes, temp_reg)));

  // off we go...
  jmp(Address(temp_reg, MethodHandleEntry::from_interpreted_entry_offset_in_bytes()));

  // for the various stubs which take control at this point,
  // see MethodHandles::generate_method_handle_stub
}
9418 
9419 
9420 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
9421                                          int extra_slot_offset) {
9422   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
9423   int stackElementSize = Interpreter::stackElementSize;
9424   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
9425 #ifdef ASSERT
9426   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
9427   assert(offset1 - offset == stackElementSize, "correct arithmetic");
9428 #endif
9429   Register             scale_reg    = noreg;
9430   Address::ScaleFactor scale_factor = Address::no_scale;
9431   if (arg_slot.is_constant()) {
9432     offset += arg_slot.as_constant() * stackElementSize;
9433   } else {
9434     scale_reg    = arg_slot.as_register();
9435     scale_factor = Address::times(stackElementSize);
9436   }
9437   offset += wordSize;           // return PC is on stack
9438   return Address(rsp, scale_reg, scale_factor, offset);
9439 }
9440 
9441 
// Verify that the oop stored at addr is valid (no-op unless
// -XX:+VerifyOops).  Like verify_oop, but the oop is loaded from memory;
// if addr is rsp-relative it must be rebiased past the saves we push here.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  // NOTE: the buffer is intentionally never freed -- the generated code
  // embeds its address, so it must outlive this assembler.
  char* b = new char[strlen(s) + 50];
  sprintf(b, "verify_oop_addr: %s", s);

#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
9477 
// Debug-only sanity check of the current thread's TLAB: verifies that
// tlab_start <= tlab_top <= tlab_end, stopping the VM otherwise.
// Emits no code unless ASSERT is built and both -XX:+UseTLAB and
// -XX:+VerifyOops are set.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    // On LP64 the current thread lives in r15; on 32-bit fetch it into rbx.
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    // Check top >= start.
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    stop("assert(top >= start)");
    should_not_reach_here();

    // Check end >= top.
    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    stop("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
9508 
// Decoded view of the x87 FPU control word (only the low 16 bits matter).
class ControlWord {
 public:
  int32_t _value;

  // Two-bit control fields.
  int  rounding_control() const        { return (_value >> 10) & 3; }
  int  precision_control() const       { return (_value >>  8) & 3; }
  // Exception mask bits (one bit each).
  bool precision() const               { return (_value & (1 <<  5)) != 0; }
  bool underflow() const               { return (_value & (1 <<  4)) != 0; }
  bool overflow() const                { return (_value & (1 <<  3)) != 0; }
  bool zero_divide() const             { return (_value & (1 <<  2)) != 0; }
  bool denormalized() const            { return (_value & (1 <<  1)) != 0; }
  bool invalid() const                 { return (_value & (1 <<  0)) != 0; }

  // Pretty-print the control word: mask letters (upper-case = set),
  // then rounding mode and precision mode.
  void print() const {
    // rounding and precision control names (fixed width, table driven)
    static const char* const rc_names[4] = {
      "round near", "round down", "round up  ", "chop      "
    };
    static const char* const pc_names[4] = {
      "24 bits ", "reserved", "53 bits ", "64 bits "
    };
    const char* rc = rc_names[rounding_control()];
    const char* pc = pc_names[precision_control()];
    // mask flags: upper-case letter when the mask bit is set
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = precision()    ? 'P' : 'p';
    f[3] = underflow()    ? 'U' : 'u';
    f[4] = overflow()     ? 'O' : 'o';
    f[5] = zero_divide()  ? 'Z' : 'z';
    f[6] = denormalized() ? 'D' : 'd';
    f[7] = invalid()      ? 'I' : 'i';
    f[8] = '\0';
    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }
};
9555 
// Decoded view of the x87 FPU status word (only the low 16 bits matter).
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return (_value & (1 << 15)) != 0; }
  // Condition code bits.
  bool C3() const                      { return (_value & (1 << 14)) != 0; }
  bool C2() const                      { return (_value & (1 << 10)) != 0; }
  bool C1() const                      { return (_value & (1 <<  9)) != 0; }
  bool C0() const                      { return (_value & (1 <<  8)) != 0; }
  // Index of the current top-of-stack register (three bits).
  int  top() const                     { return (_value >> 11) & 7; }
  bool error_status() const            { return (_value & (1 <<  7)) != 0; }
  bool stack_fault() const             { return (_value & (1 <<  6)) != 0; }
  // Exception flag bits.
  bool precision() const               { return (_value & (1 <<  5)) != 0; }
  bool underflow() const               { return (_value & (1 <<  4)) != 0; }
  bool overflow() const                { return (_value & (1 <<  3)) != 0; }
  bool zero_divide() const             { return (_value & (1 <<  2)) != 0; }
  bool denormalized() const            { return (_value & (1 <<  1)) != 0; }
  bool invalid() const                 { return (_value & (1 <<  0)) != 0; }

  // Pretty-print: exception flags, condition codes, and top index.
  void print() const {
    // condition codes: digit if set, '-' if clear
    char c[5];
    c[0] = C3() ? '3' : '-';
    c[1] = C2() ? '2' : '-';
    c[2] = C1() ? '1' : '-';
    c[3] = C0() ? '0' : '-';
    c[4] = '\0';
    // exception flags: letter if raised, '-' otherwise
    char f[9];
    f[0] = error_status() ? 'E' : '-';
    f[1] = stack_fault()  ? 'S' : '-';
    f[2] = precision()    ? 'P' : '-';
    f[3] = underflow()    ? 'U' : '-';
    f[4] = overflow()     ? 'O' : '-';
    f[5] = zero_divide()  ? 'Z' : '-';
    f[6] = denormalized() ? 'D' : '-';
    f[7] = invalid()      ? 'I' : '-';
    f[8] = '\0';
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
  }
};
9599 
// Decoded view of the x87 tag word: two tag bits per FPU data register.
class TagWord {
 public:
  int32_t _value;

  // Tag bits for physical register i (see FPU_State::tag_as_string for
  // the meaning of the four tag values).
  int tag_at(int i) const              { return (_value >> (2 * i)) & 3; }

  void print() const {
    printf("%04x", _value & 0xFFFF);
  }
};
9611 
// Raw image of one 80-bit x87 data register: 64-bit mantissa in two
// words plus the 16-bit sign/exponent field.
class FPU_Register {
 public:
  int32_t _m0;   // mantissa, low word
  int32_t _m1;   // mantissa, high word
  int16_t _ex;   // sign and exponent

  // The x87 "indefinite" QNaN: all exponent bits set,
  // mantissa 0xC0000000_00000000.
  bool is_indefinite() const {
    if (_ex != -1)                  return false;
    if (_m1 != (int32_t)0xC0000000) return false;
    return _m0 == 0;
  }

  void print() const {
    char  sign = (_ex < 0) ? '-' : '+';
    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
  }
};
9629 
9630 class FPU_State {
9631  public:
9632   enum {
9633     register_size       = 10,
9634     number_of_registers =  8,
9635     register_mask       =  7
9636   };
9637 
9638   ControlWord  _control_word;
9639   StatusWord   _status_word;
9640   TagWord      _tag_word;
9641   int32_t      _error_offset;
9642   int32_t      _error_selector;
9643   int32_t      _data_offset;
9644   int32_t      _data_selector;
9645   int8_t       _register[register_size * number_of_registers];
9646 
9647   int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
9648   FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
9649 
9650   const char* tag_as_string(int tag) const {
9651     switch (tag) {
9652       case 0: return "valid";
9653       case 1: return "zero";
9654       case 2: return "special";
9655       case 3: return "empty";
9656     }
9657     ShouldNotReachHere();
9658     return NULL;
9659   }
9660 
9661   void print() const {
9662     // print computation registers
9663     { int t = _status_word.top();
9664       for (int i = 0; i < number_of_registers; i++) {
9665         int j = (i - t) & register_mask;
9666         printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
9667         st(j)->print();
9668         printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
9669       }
9670     }
9671     printf("\n");
9672     // print control registers
9673     printf("ctrl = "); _control_word.print(); printf("\n");
9674     printf("stat = "); _status_word .print(); printf("\n");
9675     printf("tags = "); _tag_word    .print(); printf("\n");
9676   }
9677 
9678 };
9679 
// Decoded view of the EFLAGS register.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return (_value & (1 << 11)) != 0; }
  bool direction() const               { return (_value & (1 << 10)) != 0; }
  bool sign() const                    { return (_value & (1 <<  7)) != 0; }
  bool zero() const                    { return (_value & (1 <<  6)) != 0; }
  bool auxiliary_carry() const         { return (_value & (1 <<  4)) != 0; }
  bool parity() const                  { return (_value & (1 <<  2)) != 0; }
  bool carry() const                   { return (_value & (1 <<  0)) != 0; }

  // Pretty-print: letter when the flag is set, '-' otherwise.
  void print() const {
    char f[8];
    f[0] = overflow()        ? 'O' : '-';
    f[1] = direction()       ? 'D' : '-';
    f[2] = sign()            ? 'S' : '-';
    f[3] = zero()            ? 'Z' : '-';
    f[4] = auxiliary_carry() ? 'A' : '-';
    f[5] = parity()          ? 'P' : '-';
    f[6] = carry()           ? 'C' : '-';
    f[7] = '\0';
    // output
    printf("%08x  flags = %s", _value, f);
  }
};
9708 
// One saved general-purpose integer register, printed both as hex and
// as a signed decimal.
class IU_Register {
 public:
  int32_t _value;

  void print() const {
    printf("%08x  %11d", _value, _value);
  }

};
9718 
9719 class IU_State {
9720  public:
9721   Flag_Register _eflags;
9722   IU_Register   _rdi;
9723   IU_Register   _rsi;
9724   IU_Register   _rbp;
9725   IU_Register   _rsp;
9726   IU_Register   _rbx;
9727   IU_Register   _rdx;
9728   IU_Register   _rcx;
9729   IU_Register   _rax;
9730 
9731   void print() const {
9732     // computation registers
9733     printf("rax,  = "); _rax.print(); printf("\n");
9734     printf("rbx,  = "); _rbx.print(); printf("\n");
9735     printf("rcx  = "); _rcx.print(); printf("\n");
9736     printf("rdx  = "); _rdx.print(); printf("\n");
9737     printf("rdi  = "); _rdi.print(); printf("\n");
9738     printf("rsi  = "); _rsi.print(); printf("\n");
9739     printf("rbp,  = "); _rbp.print(); printf("\n");
9740     printf("rsp  = "); _rsp.print(); printf("\n");
9741     printf("\n");
9742     // control registers
9743     printf("flgs = "); _eflags.print(); printf("\n");
9744   }
9745 };
9746 
9747 
// Combined snapshot of the FPU and integer-unit state, as laid out by
// push_CPU_state / consumed by _print_CPU_state and _verify_FPU.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};
9762 
9763 
// Runtime entry called from generated code (see print_CPU_state below);
// kept as a plain C function so it can be called via RuntimeAddress.
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
9767 
9768 
// Debugging aid: emit code that dumps the full CPU state (integer and FPU
// registers) via _print_CPU_state, preserving all registers around the call.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}
9776 
9777 
// Runtime helper for MacroAssembler::verify_FPU.  Checks the saved x87
// state: for stack_depth < 0 it only requires ST7 to be empty; otherwise it
// requires the standard control word, a contiguous register stack, and
// exactly stack_depth live elements.  Returns false (after printing the
// state and asserting) on any violation.
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {  // 3 == empty tag
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
         "bad FPU control word");

  // compute stack depth: count live entries from ST(0), then empty ones
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  // NOTE(review): the stack_depth < 0 branch below is unreachable -- that
  // case already returned near the top of this function.  Looks like
  // leftover code; verify before relying on it.
  if (stack_depth < 0) {
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}
9833 
9834 
// Emit a check of the x87 FPU stack depth (no-op unless -XX:+VerifyFPU).
// Saves the full CPU state, calls _verify_FPU(stack_depth, s, state), and
// breaks into the debugger (int3) if the check reports failure.
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error: _verify_FPU's boolean result comes back in rax
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
9854 
// Load the klass pointer of the oop in src into dst.  With compressed
// oops (64-bit only) the klass field is a 32-bit narrow oop which must
// be decoded; the klass is never NULL, hence decode_heap_oop_not_null.
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}
9864 
// Load the prototype mark word (used by biased locking) of src's klass
// into dst.  Logically load_klass(dst, src) followed by a load at
// Klass::prototype_header_offset(); with compressed oops the klass
// decode (shift + heap-base add) is folded into the addressing mode of
// the final load where possible.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert (Universe::heap() != NULL, "java heap should be initialized");
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_oop_shift() != 0) {
      assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      if (LogMinObjAlignmentInBytes == Address::times_8) {
        // Decode folded into one scaled-index load: base + (dst << 3) + offset.
        movq(dst, Address(r12_heapbase, dst, Address::times_8, Klass::prototype_header_offset()));
      } else {
        // OK to use shift since we don't need to preserve flags.
        shlq(dst, LogMinObjAlignmentInBytes);
        movq(dst, Address(r12_heapbase, dst, Address::times_1, Klass::prototype_header_offset()));
      }
    } else {
      // Zero shift: the narrow klass value is already the address
      // (zero shift implies a NULL narrow oop base — see the "sanity"
      // assert in decode_heap_oop_not_null).
      movq(dst, Address(dst, Klass::prototype_header_offset()));
    }
  } else
#endif
  {
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    movptr(dst, Address(dst, Klass::prototype_header_offset()));
  }
}
9889 
// Store the klass pointer src into the oop at dst.  With compressed
// oops, src is encoded in place first — i.e. src is CLOBBERED — and a
// 32-bit store is emitted.
void MacroAssembler::store_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    encode_heap_oop_not_null(src);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
9899 
// Load a (possibly NULL) heap oop from src into dst, decoding it from
// its 32-bit compressed form when compressed oops are enabled.
void MacroAssembler::load_heap_oop(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop(dst);
  } else
#endif
    movptr(dst, src);
}
9909 
// Load a known non-NULL heap oop from src into dst.
// Doesn't do verification, generates fixed size code (some callers
// count emitted instruction bytes — see the notes in
// decode_heap_oop_not_null about unverified entry points/vtable stubs).
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, src);
}
9920 
// Store the heap oop in src to dst.  No GC write barrier is emitted
// here.  With compressed oops, src is encoded in place (CLOBBERED), so
// src must not also be a register used in the dst address.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else
#endif
    movptr(dst, src);
}
9931 
// Used for storing NULLs.  No encoding or write barrier is needed since
// NULL is 0 in both the compressed and uncompressed representations.
void MacroAssembler::store_heap_oop_null(Address dst) {
#ifdef _LP64
  if (UseCompressedOops) {
    // Narrow oop field: a 32-bit store of zero suffices.
    movl(dst, (int32_t)NULL_WORD);
  } else {
    // Full-width field: sign-extend the 32-bit zero to 64 bits.
    movslq(dst, (int32_t)NULL_WORD);
  }
#else
  movl(dst, (int32_t)NULL_WORD);
#endif
}
9944 
9945 #ifdef _LP64
// Fill the 32-bit klass gap of the object at dst (the padding after a
// narrow klass field) with src.  No-op unless compressed oops are used.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedOops) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
9952 
9953 #ifdef ASSERT
// Debug-only (ASSERT build, CheckCompressedOops flag) runtime check that
// r12_heapbase still holds the current narrow oop base; stops the VM
// with msg if it does not.  Preserves rscratch1 across the cmpptr.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
    jcc(Assembler::equal, ok);
    stop(msg);
    bind(ok);
    pop(rscratch1);
  }
}
9967 #endif
9968 
// Algorithm must match oop.inline.hpp encode_heap_oop.
// Compress the (possibly NULL) oop in r in place:
//   r = (r == NULL ? 0 : (r - heap_base)) >> shift
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based heap: encoding is just the (optional) right shift.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Branchless NULL handling: substitute the heap base for NULL so the
  // subtraction below yields 0, the encoded form of NULL.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
9987 
// Compress a known non-NULL oop in r in place: r = (r - base) >> shift.
// Faster than encode_heap_oop because the NULL case needs no handling;
// in debug builds it verifies the non-NULL precondition.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
10008 
// Two-register variant: compress the known non-NULL oop in src into dst
// (src is preserved unless dst == src): dst = (src - base) >> shift.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (Universe::narrow_oop_base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
10032 
// Decompress the (possibly NULL) narrow oop in r in place:
//   r = (r == 0) ? NULL : (r << shift) + heap_base
void  MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based heap: decoding is just the (optional) left shift.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    // The shlq sets ZF; an encoded NULL (0) stays 0 and must NOT get
    // the heap base added, so skip the add when the result is zero.
    // NOTE(review): this relies on the shift count being non-zero here
    // (a zero shift count leaves flags unchanged) — i.e. on a non-NULL
    // narrow_oop_base implying a non-zero narrow_oop_shift; confirm.
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}
10051 
// Decompress a known non-NULL narrow oop in r in place:
//   r = (r << shift) + heap_base
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    // Zero shift implies zero-based heap: nothing to do.
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
10069 
// Two-register variant: decompress the known non-NULL narrow oop in src
// into dst (src preserved unless dst == src): dst = (src << shift) + base.
// Uses a single leaq when the shift matches the times_8 scale factor.
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // One instruction: dst = r12_heapbase + src*8.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
10097 
// Materialize the compressed form of obj into register dst, recording
// an oop relocation so GC can update the embedded narrow oop constant.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
10106 
// Store the compressed form of obj to memory at dst, recording an oop
// relocation so GC can update the embedded narrow oop constant.
void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
10115 
// Compare register dst against the compressed form of obj, recording an
// oop relocation so GC can update the embedded narrow oop constant.
void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
10124 
// Compare the narrow oop in memory at dst against the compressed form
// of obj, recording an oop relocation for GC.
void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
10133 
// Reload r12_heapbase with the current narrow oop base from the VM.
// Needed after calls that may clobber r12 (e.g. runtime/native calls).
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops) {
    movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_oop_base_addr()));
  }
}
10139 #endif // _LP64
10140 
10141 
// C2 compiled method's prolog code.
// Builds the compiled frame: optional stack-overflow bang, save of rbp,
// allocation of 'framesize' bytes (which on entry includes the return
// address word), an optional debug "majik cookie" slot, and optional
// FPU mode / stack-alignment checks.
//   framesize   - total frame size in bytes, stack-aligned, incl. ret addr
//   stack_bang  - emit the stack overflow bang before building the frame
//   fp_mode_24b - (32-bit only) switch the FPU to 24-bit precision mode
void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang) {
    generate_stack_overflow_check(framesize);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    // so the first instruction is >= 5 bytes — see WARNING above.
    subptr_imm32(rsp, framesize);

    // Save RBP register now (stored into the frame rather than pushed,
    // since the frame was allocated in one subtract above).
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  // Debug check that rsp is properly aligned after frame construction.
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    // rsp was pushed once (rax), so alignment mod StackAlignmentInBytes
    // should be exactly one word off.
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    stop("Stack is not properly aligned!");
    bind(L);
  }
#endif

}
10213 
10214 
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
// On exit 'result' holds the char index of the first occurrence of the
// substring (str2, length int_cnt2) in the string (str1, length cnt1),
// or -1 if not found.  cnt1, cnt2 and tmp are clobbered.
void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                      Register cnt1, Register cnt2,
                                      int int_cnt2,  Register result,
                                      XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= 8, "this code is used only for cnt2 >= 8 chars");

  // Load substring (first 16 bytes / 8 chars).
  movdqu(vec, Address(str2, 0));
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > 8) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movdqu(vec, Address(str2, 0));
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);    // advance string pointer by one char

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == 8) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == 8) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(EXIT);

  if (int_cnt2 > 8) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), 0x0d);
    // Reload only string if does not match
    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, 8);   // cnt2 = -(remaining substring length)

    bind(SCAN_SUBSTR);
    subl(cnt1, 8);
    cmpl(cnt2, -8); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, 8);
    movl(cnt2, 8); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // int_cnt2*2 fits in a 32-bit displacement, use constant offset.
      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, 8);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index
  bind(EXIT);

} // string_indexofC8
10366 
// Small strings are loaded through stack if they cross page boundary.
// General indexOf: handles non-constant substrings (int_cnt2 == -1, length
// in cnt2) and small constant substrings (0 < int_cnt2 < 8).  On exit
// 'result' holds the char index of the match in str1, or -1 if not found.
// str1/str2/cnt1/cnt2/tmp are clobbered; rsp is saved/restored internally.
void MacroAssembler::string_indexof(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    int int_cnt2,  Register result,
                                    XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");
  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == 1) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (int_cnt2 == 2) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (int_cnt2 == 4) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 }
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*typeArrayKlass::header_size() >= 12,"sanity");
        // Load 16 bytes ending at the last char, then shift the
        // unwanted leading bytes out of the vector.
        movdqu(vec, Address(str2, (int_cnt2*2)-16));
        psrldq(vec, 16-(int_cnt2*2));
      }
    } else { // not constant substring
      cmpl(cnt2, 8);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-2;
      push(cnt2);

      // Copy substring char-by-char (backwards) onto the stack.
      bind(COPY_SUBSTR);
      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, 8);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -2;
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy string char-by-char (backwards) onto the stack.
    bind(COPY_STR);
    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      movdqu(vec, Address(str2, 0));
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    movdqu(vec, Address(str2, 0));
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    shrl(str1, 1);
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, 8); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, Address::times_2, -16));
  movl(cnt1, 8);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, 8-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, 8);
    jccb(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, 8);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), 0x0d);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, 8);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    addptr(str2, 16);
    subl(cnt1, 8);
    cmpl(cnt2, 8); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
    lea(str2, Address(str2, cnt2, Address::times_2, -16));
    lea(str1, Address(str1, cnt2, Address::times_2, -16));
    subl(cnt1, cnt2);
    movl(cnt2, 8);
    addl(cnt1, 8);
    bind(CONT_SCAN_SUBSTR);
    movdqu(vec, Address(str2, 0));
    jmpb(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // Reload original str1 (saved on the stack) for offset computation.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index

  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
10629 
// Compare strings.
//
// Lexicographically compares the 16-bit char sequences [str1, str1 + cnt1)
// and [str2, str2 + cnt2).  On exit 'result' holds the difference of the
// first pair of mismatched chars or, when one string is a prefix of the
// other, the difference of the lengths (cnt1 - cnt2); 0 means equal.
//
//   str1, str2 - addresses of the first char of each string (clobbered)
//   cnt1, cnt2 - char counts of each string (clobbered)
//   result     - receives the signed comparison result
//   vec1       - scratch XMM register, used only on the SSE4.2 path
//
// The SSE4.2 path requires result == rax, cnt2 == rdx, cnt1 == rcx
// because pcmpestri implicitly uses those registers (asserted below).
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    XMMRegister vec1) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;

  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);                          // result = original cnt1
  subl(cnt1, cnt2);                            // cnt1 = length difference
  push(cnt1);                                  // saved difference; popped at the end
  cmov32(Assembler::lessEqual, cnt2, result);  // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  // Load first characters
  load_unsigned_short(result, Address(str1, 0));
  load_unsigned_short(cnt1, Address(str2, 0));

  // Compare first characters
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);
  decrementl(cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  {
    // Check after comparing first character to see if strings are equivalent
    Label LSkip2;
    // Check if the strings start at same location
    cmpptr(str1, str2);
    jccb(Assembler::notEqual, LSkip2);

    // Check if the length difference is zero (from stack)
    cmpl(Address(rsp, 0), 0x0);
    jcc(Assembler::equal,  LENGTH_DIFF_LABEL);

    // Strings might not be equivalent
    bind(LSkip2);
  }

  Address::ScaleFactor scale = Address::times_2; // 2 bytes per char
  int stride = 8;                                // chars per 16-byte vector

  // Advance to next element past the already-compared first char
  // (16/stride == 2 bytes == one char).
  addptr(str1, 16/stride);
  addptr(str2, 16/stride);

  if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 16-byte vectors
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point str1/str2 past the chars to compare and walk forward using the
    // negated element count in 'result' as the (negative) index.
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scaned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail: re-compare the last full vector ending at
    // the string end (it may overlap chars that were already compared).
    testl(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);   // no tail chars left

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    // cnt1 is the mismatch index within the vector; adding the negative
    // vector offset in 'result' yields the element offset from the end.
    addptr(result, cnt1);
    movptr(cnt2, result);
    load_unsigned_short(result, Address(str1, cnt2, scale));
    load_unsigned_short(cnt1, Address(str2, cnt2, scale));
    subl(result, cnt1);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result); // remaining element count (< stride)
    // Fallthru to tail compare
  }

  // Shift str2 and str1 to the end of the arrays, negate min
  lea(str1, Address(str1, cnt2, scale, 0));
  lea(str2, Address(str2, cnt2, scale, 0));
  negptr(cnt2);

  // Compare the rest of the elements one char at a time
  bind(WHILE_HEAD_LABEL);
  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  jmpb(DONE_LABEL);

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
}
10766 
// Compare char[] arrays aligned to 4 bytes or substrings.
//
// Sets 'result' to 1 when the two char sequences are equal, 0 otherwise.
//   is_array_equ - true:  ary1/ary2 are char[] array oops; they are
//                  null-checked, their lengths are compared, 'limit' is
//                  loaded with the length and the addresses are advanced
//                  to the first element.
//                  false: ary1/ary2 already point at the chars and
//                  'limit' holds the char count on entry.
//   chr          - scratch GP register
//   vec1, vec2   - scratch XMM registers, used only with SSE4.2
// ary1, ary2, limit and chr are clobbered.
void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                        Register limit, Register result, Register chr,
                                        XMMRegister vec1, XMMRegister vec2) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  // Check the input args: identical references are trivially equal
  cmpptr(ary1, ary2);
  jcc(Assembler::equal, TRUE_LABEL);

  if (is_array_equ) {
    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);   // ary1 is null
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);   // ary2 is null

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0: empty sequences are equal
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  shll(limit, 1);      // byte count != 0 (2 bytes per char)
  movl(result, limit); // copy

  if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000e);  //   tail count (in bytes); bit 0 is always clear
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    // Point past the vector portion and index backwards via negated limit
    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);          // vec1 == 0 iff all 16 bytes are equal

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);  // no tail bytes left

    // Compare the tail with one (possibly overlapping) 16-byte vector
    // that ends exactly at the last byte of the sequences.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result); // tail byte count
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any; ary1/ary2 now point
  // just past the 4-byte-compared portion.
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, TRUE_LABEL);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
}
10880 
10881 #ifdef PRODUCT
10882 #define BLOCK_COMMENT(str) /* nothing */
10883 #else
10884 #define BLOCK_COMMENT(str) block_comment(str)
10885 #endif
10886 
10887 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Fill 'count' elements of type 't' (T_BYTE, T_SHORT or T_INT) starting at
// 'to' with 'value'.
//   aligned - caller asserts extra alignment of 'to'; when false (and no
//             unaligned load/stores), byte/short fills first align 'to'
//   rtmp    - scratch GP register
//   xtmp    - scratch XMM register, used when UseSSE >= 2
// 'value' is replicated to a full 32-bit word for sub-int types.
// to, value, count and rtmp are clobbered.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  Label L_fill_2_bytes, L_fill_4_bytes;

  // shift = log2(elements per 32-bit word), so (1 << shift) elements
  // occupy 4 bytes.  Note this is the inverse of the element-size shift.
  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;   // 4 bytes per word
      break;
    case T_SHORT:
      shift = 1;   // 2 shorts per word
      break;
    case T_INT:
      shift = 0;   // 1 int per word
      break;
    default: ShouldNotReachHere();
  }

  // Replicate 'value' across a full 32-bit word for sub-int element types.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);        // byte now in both low bytes
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);        // duplicate low 16 bits into high 16 bits
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));  // 2 bytes == (1 << (shift-1)) elements
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    // No XMM registers available: fill with 32-bit integer stores.
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);   // 8 << shift elements == 32 bytes
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);   // undo the last over-subtraction
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));  // (1 << (shift+1)) elements == 8 bytes
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);   // 4 bytes == (1 << shift) elements
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      // Fill 32-byte chunks
      movdl(xtmp, value);
      pshufd(xtmp, xtmp, 0);   // broadcast the 32-bit word to all 4 lanes

      subl(count, 8 << shift);
      jcc(Assembler::less, L_check_fill_8_bytes);
      align(16);

      BIND(L_fill_32_bytes_loop);

      if (UseUnalignedLoadStores) {
        movdqu(Address(to, 0), xtmp);
        movdqu(Address(to, 16), xtmp);
      } else {
        movq(Address(to, 0), xtmp);
        movq(Address(to, 8), xtmp);
        movq(Address(to, 16), xtmp);
        movq(Address(to, 24), xtmp);
      }

      addptr(to, 32);
      subl(count, 8 << shift);
      jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
      BIND(L_check_fill_8_bytes);
      addl(count, 8 << shift);   // undo the last over-subtraction
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);      // at least (1 << shift) elements == 4 bytes left?
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}
11058 #undef BIND
11059 #undef BLOCK_COMMENT
11060 
11061 
11062 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
11063   switch (cond) {
11064     // Note some conditions are synonyms for others
11065     case Assembler::zero:         return Assembler::notZero;
11066     case Assembler::notZero:      return Assembler::zero;
11067     case Assembler::less:         return Assembler::greaterEqual;
11068     case Assembler::lessEqual:    return Assembler::greater;
11069     case Assembler::greater:      return Assembler::lessEqual;
11070     case Assembler::greaterEqual: return Assembler::less;
11071     case Assembler::below:        return Assembler::aboveEqual;
11072     case Assembler::belowEqual:   return Assembler::above;
11073     case Assembler::above:        return Assembler::belowEqual;
11074     case Assembler::aboveEqual:   return Assembler::below;
11075     case Assembler::overflow:     return Assembler::noOverflow;
11076     case Assembler::noOverflow:   return Assembler::overflow;
11077     case Assembler::negative:     return Assembler::positive;
11078     case Assembler::positive:     return Assembler::negative;
11079     case Assembler::parity:       return Assembler::noParity;
11080     case Assembler::noParity:     return Assembler::parity;
11081   }
11082   ShouldNotReachHere(); return Assembler::overflow;
11083 }
11084 
11085 SkipIfEqual::SkipIfEqual(
11086     MacroAssembler* masm, const bool* flag_addr, bool value) {
11087   _masm = masm;
11088   _masm->cmp8(ExternalAddress((address)flag_addr), value);
11089   _masm->jcc(Assembler::equal, _label);
11090 }
11091 
11092 SkipIfEqual::~SkipIfEqual() {
11093   _masm->bind(_label);
11094 }